xref: /petsc/src/mat/impls/aij/seq/matptap.c (revision 5a7d977cdd9ec927d1da53f31d29316a4d34d48d)
1 /*
2   Defines projective product routines where A is an AIJ matrix
3           C = P^T * A * P
4 */
5 
6 #include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
7 #include "src/mat/utils/freespace.h"
8 #include "src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatPtAP"
13 /*@
14    MatPtAP - Creates the matrix projection C = P^T * A * P
15 
16    Collective on Mat
17 
18    Input Parameters:
19 +  A - the matrix
20 .  P - the projection matrix
21 .  scall - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
22 -  fill - expected fill as ratio of nnz(C)/(nnz(A) + nnz(P))
23 
24    Output Parameter:
25 .  C - the product matrix
26 
27    Notes:
28    C will be created and must be destroyed by the user with MatDestroy().
29 
30    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
31    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.
32 
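   Example usage (a minimal sketch with hypothetical matrices A and P; error checking
   via CHKERRQ() is omitted for brevity):
.vb
      Mat A,P,C;
      /* ... create and assemble A (n x n) and the projection P (n x m) ... */
      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);
      /* ... change the numerical values (but not the nonzero pattern) of A ... */
      ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);
      ierr = MatDestroy(C);
.ve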
33    Level: intermediate
34 
35 .seealso: MatPtAPSymbolic(),MatPtAPNumeric(),MatMatMult()
36 @*/
37 PetscErrorCode MatPtAP(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
38 {
39   PetscErrorCode ierr;
40   PetscErrorCode (*fA)(Mat,Mat,MatReuse,PetscReal,Mat *);
41   PetscErrorCode (*fP)(Mat,Mat,MatReuse,PetscReal,Mat *);
42 
43   PetscFunctionBegin;
44   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
45   PetscValidType(A,1);
46   MatPreallocated(A);
47   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
48   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
49   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
50   PetscValidType(P,2);
51   MatPreallocated(P);
52   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
53   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
54   PetscValidPointer(C,5);
55 
56   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
57 
58   if (fill <=0.0) SETERRQ1(PETSC_ERR_ARG_SIZ,"fill=%g must be > 0.0",fill);
59 
60   /* For now, we do not dispatch based on the type of A and P */
61   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
62   fA = A->ops->ptap;
63   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for A of type %s",A->type_name);
64   fP = P->ops->ptap;
65   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for P of type %s",P->type_name);
66   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAP requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
67 
68   ierr = PetscLogEventBegin(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
69   ierr = (*fA)(A,P,scall,fill,C);CHKERRQ(ierr);
70   ierr = PetscLogEventEnd(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 
74 EXTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,PetscReal,Mat*);
75 EXTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,Mat C);
76 
77 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
78 #undef __FUNCT__
79 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
80 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
81 {
82   PetscErrorCode       ierr;
83   Mat_MatMatMultMPI    *ptap;
84   PetscObjectContainer container;
85 
86   PetscFunctionBegin;
87   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
88   if (container) {
89     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
90     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
91     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
92     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
93     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
94 
95     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
96     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
97     ierr = PetscFree(ptap);CHKERRQ(ierr);
98   }
99 
100   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
101   PetscFunctionReturn(0);
102 }
103 
104 #undef __FUNCT__
105 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
106 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
107 {
108   PetscErrorCode       ierr;
109   Mat_MatMatMultMPI    *ptap;
110   PetscObjectContainer container;
111 
112   PetscFunctionBegin;
113   if (scall == MAT_INITIAL_MATRIX){
114     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
115     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
116 
117     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
118     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
119 
120     /* get P_loc by taking all local rows of P */
121     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
122 
123     /* attach the supporting struct to P for reuse */
124     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
125     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
126     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
127     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
128 
129     /* now, compute symbolic local P^T*A*P */
130     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
131     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
132   } else if (scall == MAT_REUSE_MATRIX){
133     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
134     if (container) {
135       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
136     } else {
137       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
138     }
139 
140     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
141     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
142 
143     /* get P_loc by taking all local rows of P */
144     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
145 
146   } else {
147     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
148   }
149   /* now, compute numeric local P^T*A*P */
150   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
151   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
152   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
153 
154   PetscFunctionReturn(0);
155 }
156 
157 #undef __FUNCT__
158 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
159 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
160 {
161   PetscErrorCode       ierr;
162   Mat                  P_loc,P_oth;
163   Mat_MatMatMultMPI    *ptap;
164   PetscObjectContainer container;
165   FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
166   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
167   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
168   Mat_SeqAIJ           *p_loc,*p_oth;
169   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj;
170   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
171   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
172   PetscInt             aN=A->N,am=A->m,pN=P->N;
173   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
174   PetscBT              lnkbt;
175   PetscInt             prstart,prend,nprow_loc,nprow_oth;
176   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
177 
178   MPI_Comm             comm=A->comm;
179   Mat                  B_mpi;
180   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
181   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
182   PetscInt             len,proc,*dnz,*onz,*owners;
183   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
184   PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
185   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
186   MPI_Status           *status;
187   Mat_Merge_SeqsToMPI  *merge;
188 
189   PetscFunctionBegin;
190   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
191   if (container) {
192     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
193   } else {
194     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
195   }
196   /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
197   /*--------------------------------------------------*/
198   /* get data from P_loc and P_oth */
199   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
200   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
201   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
202   prend = prstart + pm;
203 
204   /* get ij structure of P_loc^T */
205   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
206   ptJ=ptj;
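  /* pti/ptj hold the compressed-row structure of P_loc^T: row r of the transpose lists the rows of P_loc that have a nonzero in column r */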
207 
208   /* Allocate ci array, arrays for fill computation and */
209   /* free space for accumulating nonzero column info */
210   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
211   ci[0] = 0;
212 
213   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
214   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
215   ptasparserow_loc = ptadenserow_loc + aN;
216   ptadenserow_oth  = ptasparserow_loc + aN;
217   ptasparserow_oth = ptadenserow_oth + aN;
218 
219   /* create and initialize a linked list */
220   nlnk = pN+1;
221   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
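  /* lnk (with bit array lnkbt) is a sorted linked list used to accumulate the distinct column indices of one row of C at a time; PetscLLClean() below drains it into the free-space buffer and resets it for the next row */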
222 
223   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
224   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
225   nnz           = adi[am] + aoi[am];
226   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
227   current_space = free_space;
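  /* illustrative example (assumed sizes): nnz(A)=10000, fill=2.0 and P->N/P->M=1/4 give an initial allocation of about 2.0*10000/4 = 5000 entries; GetMoreSpace() doubles the space below whenever this guess proves too small */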
228 
229   /* determine symbolic info for each row of C: */
230   for (i=0;i<pN;i++) {
231     ptnzi  = pti[i+1] - pti[i];
232     nprow_loc = 0; nprow_oth = 0;
233     /* i-th row of symbolic P_loc^T*A_loc: */
234     for (j=0;j<ptnzi;j++) {
235       arow = *ptJ++;
236       /* diagonal portion of A */
237       anzj = adi[arow+1] - adi[arow];
238       ajj  = adj + adi[arow];
239       for (k=0;k<anzj;k++) {
240         col = ajj[k]+prstart;
241         if (!ptadenserow_loc[col]) {
242           ptadenserow_loc[col]    = -1;
243           ptasparserow_loc[nprow_loc++] = col;
244         }
245       }
246       /* off-diagonal portion of A */
247       anzj = aoi[arow+1] - aoi[arow];
248       ajj  = aoj + aoi[arow];
249       for (k=0;k<anzj;k++) {
250         col = a->garray[ajj[k]];  /* global col */
251         if (col < cstart){
252           col = ajj[k];
253         } else if (col >= cend){
254           col = ajj[k] + pm;
255         } else {
256           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
257         }
258         if (!ptadenserow_oth[col]) {
259           ptadenserow_oth[col]    = -1;
260           ptasparserow_oth[nprow_oth++] = col;
261         }
262       }
263     }
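    /* ptasparserow_loc[0..nprow_loc-1] and ptasparserow_oth[0..nprow_oth-1] now list, in the shifted numbering used above, the rows of P_loc and P_oth that contribute to row i of P^T*A */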
264 
265     /* using symbolic info of local PtA, determine symbolic info for row of C: */
266     cnzi   = 0;
267     /* rows of P_loc */
268     ptaj = ptasparserow_loc;
269     for (j=0; j<nprow_loc; j++) {
270       prow = *ptaj++;
271       prow -= prstart; /* rm */
272       pnzj = pi_loc[prow+1] - pi_loc[prow];
273       pjj  = pj_loc + pi_loc[prow];
274       /* add non-zero cols of P into the sorted linked list lnk */
275       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
276       cnzi += nlnk;
277     }
278     /* rows of P_oth */
279     ptaj = ptasparserow_oth;
280     for (j=0; j<nprow_oth; j++) {
281       prow = *ptaj++;
282       if (prow >= prend) prow -= pm; /* rm */
283       pnzj = pi_oth[prow+1] - pi_oth[prow];
284       pjj  = pj_oth + pi_oth[prow];
285       /* add non-zero cols of P into the sorted linked list lnk */
286       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
287       cnzi += nlnk;
288     }
289 
290     /* If free space is not available, make more free space */
291     /* Double the amount of total space in the list */
292     if (current_space->local_remaining<cnzi) {
293       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
294     }
295 
296     /* Copy data into free space, and zero out denserows */
297     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
298     current_space->array           += cnzi;
299     current_space->local_used      += cnzi;
300     current_space->local_remaining -= cnzi;
301 
302     for (j=0;j<nprow_loc; j++) {
303       ptadenserow_loc[ptasparserow_loc[j]] = 0;
304     }
305     for (j=0;j<nprow_oth; j++) {
306       ptadenserow_oth[ptasparserow_oth[j]] = 0;
307     }
308 
309     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
310     /*        For now, we will recompute what is needed. */
311     ci[i+1] = ci[i] + cnzi;
312   }
313   /* Clean up. */
314   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
315 
316   /* nnz is now stored in ci[pN], column indices are in the list of free space */
317   /* Allocate space for cj, initialize cj, and */
318   /* destroy list of free space and other temporary array(s) */
319   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
320   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
321   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
322   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
323 
324   /* Allocate space for ca */
325   /*
326   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
327   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
328   */
329 
330   /* add C_seq into mpi C              */
331   /*-----------------------------------*/
332   free_space=PETSC_NULL; current_space=PETSC_NULL;
333 
334   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
335   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
336 
337   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
338 
339 
340   /* determine row ownership */
341   /*---------------------------------------------------------*/
342   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
343   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
344   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
345   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
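  /* owners[] has size+1 entries; process proc owns global rows owners[proc] through owners[proc+1]-1 of C */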
346 
347   /* determine the number of messages to send, their lengths */
348   /*---------------------------------------------------------*/
349   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
350   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
351   len_s  = merge->len_s;
352   len = 0;  /* length of buf_si[] */
353   merge->nsend = 0;
354   for (proc=0; proc<size; proc++){
355     len_si[proc] = 0;
356     if (proc == rank){
357       len_s[proc] = 0;
358     } else {
359       len_si[proc] = owners[proc+1] - owners[proc] + 1;
360       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of rows to be sent to [proc] */
361     }
362     if (len_s[proc]) {
363       merge->nsend++;
364       nrows = 0;
365       for (i=owners[proc]; i<owners[proc+1]; i++){
366         if (ci[i+1] > ci[i]) nrows++;
367       }
368       len_si[proc] = 2*(nrows+1);
369       len += len_si[proc];
370     }
371   }
372 
373   /* determine the number and length of messages to receive for ij-structure */
374   /*-------------------------------------------------------------------------*/
375   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
376   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
377 
378   /* post the Irecv of j-structure */
379   /*-------------------------------*/
380   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
381   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
382 
383   /* post the Isend of j-structure */
384   /*--------------------------------*/
385   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
386   sj_waits = si_waits + merge->nsend;
387 
388   for (proc=0, k=0; proc<size; proc++){
389     if (!len_s[proc]) continue;
390     i = owners[proc];
391     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
392     k++;
393   }
394 
395   /* receives and sends of j-structure are complete */
396   /*------------------------------------------------*/
397   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
398   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
399   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
400 
401   /* send and recv i-structure */
402   /*---------------------------*/
403   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
404   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
405 
406   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
407   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
408   for (proc=0,k=0; proc<size; proc++){
409     if (!len_s[proc]) continue;
410     /* form outgoing message for i-structure:
411          buf_si[0]:                 nrows to be sent
412                [1:nrows]:           row index (local to the receiving process)
413                [nrows+1:2*nrows+1]: i-structure index
414     */
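    /* illustrative example (assumed values): sending two nonempty rows with local indices 3 and 7 holding 2 and 5 nonzeros gives buf_si = {2, 3,7, 0,2,7} and len_si[proc] = 6 */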
415     /*-------------------------------------------*/
416     nrows = len_si[proc]/2 - 1;
417     buf_si_i    = buf_si + nrows+1;
418     buf_si[0]   = nrows;
419     buf_si_i[0] = 0;
420     nrows = 0;
421     for (i=owners[proc]; i<owners[proc+1]; i++){
422       anzi = ci[i+1] - ci[i];
423       if (anzi) {
424         buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */
425         buf_si[nrows+1] = i-owners[proc]; /* local row index */
426         nrows++;
427       }
428     }
429     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
430     k++;
431     buf_si += len_si[proc];
432   }
433 
434   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
435   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
436 
437   ierr = PetscLogInfo((PetscObject)A,"MatPtAPSymbolic_MPIAIJ_MPIAIJ: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
438   for (i=0; i<merge->nrecv; i++){
439     ierr = PetscLogInfo((PetscObject)A,"MatPtAPSymbolic_MPIAIJ_MPIAIJ:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
440   }
441 
442   ierr = PetscFree(len_si);CHKERRQ(ierr);
443   ierr = PetscFree(len_ri);CHKERRQ(ierr);
444   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
445   ierr = PetscFree(si_waits);CHKERRQ(ierr);
446   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
447   ierr = PetscFree(buf_s);CHKERRQ(ierr);
448   ierr = PetscFree(status);CHKERRQ(ierr);
449 
450   /* compute a local seq matrix in each processor */
451   /*----------------------------------------------*/
452   /* allocate bi array and free space for accumulating nonzero column info */
453   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
454   bi[0] = 0;
455 
456   /* create and initialize a linked list */
457   nlnk = pN+1;
458   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
459 
460   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
461   len = 0;
462   len  = ci[owners[rank+1]] - ci[owners[rank]];
463   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
464   current_space = free_space;
465 
466   /* determine symbolic info for each local row */
467   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
468   nextrow = buf_ri_k + merge->nrecv;
469   nextci  = nextrow + merge->nrecv;
470   for (k=0; k<merge->nrecv; k++){
471     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
472     nrows = *buf_ri_k[k];
473     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
474     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the offset part of the k-th received i-structure */
475   }
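  /* each received i-structure buf_ri[k] has the same layout as buf_si above: the number of rows, then the local row indices, then the running nonzero offsets */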
476 
477   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
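  /* dnz[i] and onz[i] count, for local row i, the nonzeros that fall in the diagonal and off-diagonal blocks; MatMPIAIJSetPreallocation() consumes them below */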
478   len = 0;
479   for (i=0; i<pn; i++) {
480     bnzi   = 0;
481     /* add local non-zero cols of this proc's C_seq into lnk */
482     arow   = owners[rank] + i;
483     anzi   = ci[arow+1] - ci[arow];
484     cji    = cj + ci[arow];
485     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
486     bnzi += nlnk;
487     /* add received col data into lnk */
488     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
489       if (i == *nextrow[k]) { /* i-th row */
490         anzi = *(nextci[k]+1) - *nextci[k];
491         cji  = buf_rj[k] + *nextci[k];
492         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
493         bnzi += nlnk;
494         nextrow[k]++; nextci[k]++;
495       }
496     }
497     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
498 
499     /* if free space is not available, make more free space */
500     if (current_space->local_remaining<bnzi) {
501       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
502       nspacedouble++;
503     }
504     /* copy data into free space, then initialize lnk */
505     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
506     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
507 
508     current_space->array           += bnzi;
509     current_space->local_used      += bnzi;
510     current_space->local_remaining -= bnzi;
511 
512     bi[i+1] = bi[i] + bnzi;
513   }
514 
515   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
516 
517   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
518   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
519   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
520 
521   /* create symbolic parallel matrix B_mpi */
522   /*---------------------------------------*/
523   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
524   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
525   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
526   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
527 
528   /* B_mpi is not ready for use - assembly will be done in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
529   B_mpi->assembled     = PETSC_FALSE;
530   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
531   merge->bi            = bi;
532   merge->bj            = bj;
533   merge->ci            = ci;
534   merge->cj            = cj;
535   merge->buf_ri        = buf_ri;
536   merge->buf_rj        = buf_rj;
537   merge->C_seq         = PETSC_NULL;
538 
539   /* attach the supporting struct to B_mpi for reuse */
540   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
541   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
542   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
543   *C = B_mpi;
544 
545   PetscFunctionReturn(0);
546 }
547 
548 #undef __FUNCT__
549 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
550 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
551 {
552   PetscErrorCode       ierr;
553   Mat_Merge_SeqsToMPI  *merge;
554   Mat_MatMatMultMPI    *ptap;
555   PetscObjectContainer cont_merge,cont_ptap;
556 
557   PetscInt       flops=0;
558   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
559   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
560   Mat_SeqAIJ     *p_loc,*p_oth;
561   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
562   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
563   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
564   PetscInt       *cjj;
565   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj;
566   MatScalar      *pa_loc,*pA,*pa_oth;
567   PetscInt       am=A->m,cN=C->N;
568 
569   MPI_Comm             comm=C->comm;
570   PetscMPIInt          size,rank,taga,*len_s;
571   PetscInt             *owners;
572   PetscInt             proc;
573   PetscInt             **buf_ri,**buf_rj;
574   PetscInt             cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
575   PetscInt             nrows,**buf_ri_k,**nextrow,**nextcseqi;
576   MPI_Request          *s_waits,*r_waits;
577   MPI_Status           *status;
578   MatScalar            **abuf_r,*ba_i;
579   PetscInt             *cseqi,*cseqj;
580   PetscInt             *cseqj_tmp;
581   MatScalar            *cseqa_tmp;
582 
583   PetscFunctionBegin;
584   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
585   if (cont_merge) {
586     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
587   } else {
588     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
589   }
590   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
591   if (cont_ptap) {
592     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
593   } else {
594     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
595   }
596   /* get data from symbolic products */
597   p_loc=(Mat_SeqAIJ*)(ptap->B_loc)->data;
598   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
599   pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
600   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
601 
602   cseqi = merge->ci; cseqj=merge->cj;
603   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
604   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
605 
606   /* Allocate temporary array for storage of one row of A*P */
607   ierr = PetscMalloc(cN*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
608   ierr = PetscMemzero(apa,cN*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
609   apj      = (PetscInt *)(apa + cN);
610   apjdense = apj + cN;
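  /* workspace layout: apa[0..cN-1] holds the values of the current sparse row of A*P, apj[0..apnzj-1] its column indices, and apjdense[] flags which columns are already present in the row */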
611 
612   /* Clear old values in C_Seq */
613   ierr = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
614 
615   for (i=0;i<am;i++) {
616     /* Form i-th sparse row of A*P */
617      apnzj = 0;
618     /* diagonal portion of A */
619     anzi  = adi[i+1] - adi[i];
620     for (j=0;j<anzi;j++) {
621       prow = *adj;
622       adj++;
623       pnzj = pi_loc[prow+1] - pi_loc[prow];
624       pjj  = pj_loc + pi_loc[prow];
625       paj  = pa_loc + pi_loc[prow];
626       for (k=0;k<pnzj;k++) {
627         if (!apjdense[pjj[k]]) {
628           apjdense[pjj[k]] = -1;
629           apj[apnzj++]     = pjj[k];
630         }
631         apa[pjj[k]] += (*ada)*paj[k];
632       }
633       flops += 2*pnzj;
634       ada++;
635     }
636     /* off-diagonal portion of A */
637     anzi  = aoi[i+1] - aoi[i];
638     for (j=0;j<anzi;j++) {
639       col = a->garray[*aoj];
640       if (col < cstart){
641         prow = *aoj;
642       } else if (col >= cend){
643         prow = *aoj;
644       } else {
645         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
646       }
647       aoj++;
648       pnzj = pi_oth[prow+1] - pi_oth[prow];
649       pjj  = pj_oth + pi_oth[prow];
650       paj  = pa_oth + pi_oth[prow];
651       for (k=0;k<pnzj;k++) {
652         if (!apjdense[pjj[k]]) {
653           apjdense[pjj[k]] = -1;
654           apj[apnzj++]     = pjj[k];
655         }
656         apa[pjj[k]] += (*aoa)*paj[k];
657       }
658       flops += 2*pnzj;
659       aoa++;
660     }
661     /* Sort the j index array for quick sparse axpy. */
662     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
663 
664     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
665     pnzi = pi_loc[i+1] - pi_loc[i];
666     for (j=0;j<pnzi;j++) {
667       nextap = 0;
668       crow   = *pJ++;
669       cjj    = cseqj + cseqi[crow];
670       caj    = cseqa + cseqi[crow];
671       /* Perform sparse axpy operation.  Note cjj includes apj. */
672       for (k=0;nextap<apnzj;k++) {
673         if (cjj[k]==apj[nextap]) {
674           caj[k] += (*pA)*apa[apj[nextap++]];
675         }
676       }
677       flops += 2*apnzj;
678       pA++;
679     }
680 
681     /* Zero the current row info for A*P */
682     for (j=0;j<apnzj;j++) {
683       apa[apj[j]]      = 0.;
684       apjdense[apj[j]] = 0;
685     }
686   }
687 
688   ierr = PetscFree(apa);CHKERRQ(ierr);
689 
690   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
691   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
692 
693   bi     = merge->bi;
694   bj     = merge->bj;
695   buf_ri = merge->buf_ri;
696   buf_rj = merge->buf_rj;
697 
698   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
699   ierr   = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
700 
701   /* send and recv matrix values */
702   /*-----------------------------*/
703   len_s  = merge->len_s;
704   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
705   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
706 
707   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
708   for (proc=0,k=0; proc<size; proc++){
709     if (!len_s[proc]) continue;
710     i = owners[proc];
711     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
712     k++;
713   }
714 
715   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
716   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
717   ierr = PetscFree(status);CHKERRQ(ierr);
718 
719   ierr = PetscFree(s_waits);CHKERRQ(ierr);
720   ierr = PetscFree(r_waits);CHKERRQ(ierr);
721 
722   /* insert mat values of mpimat */
723   /*----------------------------*/
724   ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr);
725   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
726   nextrow = buf_ri_k + merge->nrecv;
727   nextcseqi  = nextrow + merge->nrecv;
728 
729   for (k=0; k<merge->nrecv; k++){
730     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
731     nrows = *(buf_ri_k[k]);
732     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
733     nextcseqi[k]   = buf_ri_k[k] + (nrows + 1);/* points to the offset part of the k-th received i-structure */
734   }
735 
736   /* set values of ba */
737   for (i=0; i<C->m; i++) {
738     cseqrow = owners[rank] + i;
739     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
740     bnzi = bi[i+1] - bi[i];
741     ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
742 
743     /* add local non-zero vals of this proc's C_seq into ba */
744     cseqnzi = cseqi[cseqrow+1] - cseqi[cseqrow];
745     cseqj_tmp = cseqj + cseqi[cseqrow];
746     cseqa_tmp = cseqa + cseqi[cseqrow];
747     nextcseqj = 0;
748     for (j=0; nextcseqj<cseqnzi; j++){
749       if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
750         ba_i[j] += cseqa_tmp[nextcseqj++];
751       }
752     }
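    /* the scan above relies on bj_i being a sorted superset of cseqj_tmp, so every local value finds its slot in a single pass */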
753 
754     /* add received vals into ba */
755     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
756       /* i-th row */
757       if (i == *nextrow[k]) {
758         cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k];
759         cseqj_tmp   = buf_rj[k] + *(nextcseqi[k]);
760         cseqa_tmp   = abuf_r[k] + *(nextcseqi[k]);
761         nextcseqj = 0;
762         for (j=0; nextcseqj<cseqnzi; j++){
763           if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
764             ba_i[j] += cseqa_tmp[nextcseqj++];
765           }
766         }
767         nextrow[k]++; nextcseqi[k]++;
768       }
769     }
770     ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
771     flops += 2*bnzi;
772   }
773   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
774   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
775 
776   ierr = PetscFree(cseqa);CHKERRQ(ierr);
777   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
778   ierr = PetscFree(ba_i);CHKERRQ(ierr);
779   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
780   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
781 
782   PetscFunctionReturn(0);
783 }
784 
785 #ifdef TOBEREMOVED
786 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
787 #undef __FUNCT__
788 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
789 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
790 {
791   PetscErrorCode       ierr;
792   Mat_MatMatMultMPI    *ptap;
793   PetscObjectContainer container;
794 
795   PetscFunctionBegin;
796   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
797   if (container) {
798     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
799     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
800     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
801     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
802     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
803 
804     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
805     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
806   }
807   ierr = PetscFree(ptap);CHKERRQ(ierr);
808 
809   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
810   PetscFunctionReturn(0);
811 }
812 
813 #undef __FUNCT__
814 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
815 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
816 {
817   PetscErrorCode       ierr;
818   Mat_MatMatMultMPI    *ptap;
819   PetscObjectContainer container;
820 
821   PetscFunctionBegin;
822   if (scall == MAT_INITIAL_MATRIX){
823     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
824     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
825 
826     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
827     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
828 
829     /* get P_loc by taking all local rows of P */
830     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
831 
832     /* attach the supporting struct to P for reuse */
833     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
834     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
835     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
836     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
837 
838     /* now, compute symbolic local P^T*A*P */
839     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
840     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
841   } else if (scall == MAT_REUSE_MATRIX){
842     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
843     if (container) {
844       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
845     } else {
846       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
847     }
848     /* update P_oth */
849     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
850 
851   } else {
852     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
853   }
854   /* now, compute numeric local P^T*A*P */
855   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
856   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
857   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
858 
859   PetscFunctionReturn(0);
860 }
861 /* Set symbolic info for i-th row of local product C=P^T*A*P */
862 #define MatPtAPSymbolicLocal_Private(ptM,pti,ptj,ci,cj) 0;\
863 {\
864   /* allocate ci array, arrays for fill computation and */\
865   /* free space for accumulating nonzero column info */\
866   ierr = PetscMalloc((ptM+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);\
867   ci[0] = 0;\
868 \
869   /* set initial free space to be nnz(A) scaled by fill*P->N/PtM. */\
870   /* this should be reasonable if sparsity of PtAP is similar to that of A. */\
871   nnz           = adi[am] + aoi[am];\
872   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/ptM+1),&free_space);\
873   current_space = free_space;\
874 \
875   ptJ=ptj;\
876   for (i=0; i<ptM; i++) {\
877     ptnzi  = pti[i+1] - pti[i];\
878     nprow_loc = 0; nprow_oth = 0;\
879     /* i-th row of symbolic Pt*A: */\
880     for (j=0;j<ptnzi;j++) {\
881       arow = *ptJ++;\
882       /* diagonal portion of A */\
883       anzj = adi[arow+1] - adi[arow];\
884       ajj  = adj + adi[arow];\
885       for (k=0;k<anzj;k++) {\
886         col = ajj[k]+prstart;\
887         if (!ptadenserow_loc[col]) {\
888           ptadenserow_loc[col]    = -1;\
889           ptasparserow_loc[nprow_loc++] = col;\
890         }\
891       }\
892       /* off-diagonal portion of A */\
893       anzj = aoi[arow+1] - aoi[arow];\
894       ajj  = aoj + aoi[arow];\
895       for (k=0;k<anzj;k++) {\
896         col = a->garray[ajj[k]];  /* global col */\
897         if (col < cstart){\
898           col = ajj[k];\
899         } else if (col >= cend){\
900           col = ajj[k] + pm;\
901         } else {\
902           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");\
903         }\
904         if (!ptadenserow_oth[col]) {\
905           ptadenserow_oth[col]    = -1;\
906           ptasparserow_oth[nprow_oth++] = col;\
907         }\
908       }\
909     }\
910     /* determine symbolic info for row of C_seq: */\
911     cnzi   = 0;\
912     /* rows of P_loc */\
913     ptaj = ptasparserow_loc;\
914     for (j=0; j<nprow_loc; j++) {\
915       prow = *ptaj++; \
916       prow -= prstart; /* rm */\
917       pnzj = pi_loc[prow+1] - pi_loc[prow];\
918       pjj  = pj_loc + pi_loc[prow];\
919       /* add non-zero cols of P into the sorted linked list lnk */\
920       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
921       cnzi += nlnk;\
922     }\
923     /* rows of P_oth */\
924     ptaj = ptasparserow_oth;\
925     for (j=0; j<nprow_oth; j++) {\
926       prow = *ptaj++; \
927       if (prow >= prend) prow -= pm; /* rm */\
928       pnzj = pi_oth[prow+1] - pi_oth[prow];\
929       pjj  = pj_oth + pi_oth[prow];\
930       /* add non-zero cols of P into the sorted linked list lnk */\
931       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
932       cnzi += nlnk;\
933     }\
934    \
935     /* If free space is not available, make more free space */\
936     /* Double the amount of total space in the list */\
937     if (current_space->local_remaining<cnzi) {\
938       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);\
939     }\
940 \
941     /* Copy data into free space, and zero out denserows */\
942     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);\
943     current_space->array           += cnzi;\
944     current_space->local_used      += cnzi;\
945     current_space->local_remaining -= cnzi;\
946    \
947     for (j=0;j<nprow_loc; j++) {\
948       ptadenserow_loc[ptasparserow_loc[j]] = 0;\
949     }\
950     for (j=0;j<nprow_oth; j++) {\
951       ptadenserow_oth[ptasparserow_oth[j]] = 0;\
952     }\
953     \
954     /* Aside: Perhaps we should save the pta info for the numerical factorization. */\
955     /*        For now, we will recompute what is needed. */ \
956     ci[i+1] = ci[i] + cnzi;\
957   }\
958   /* nnz is now stored in ci[ptm], column indices are in the list of free space */\
959   /* Allocate space for cj, initialize cj, and */\
960   /* destroy list of free space and other temporary array(s) */\
961   ierr = PetscMalloc((ci[ptM]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);\
962   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);\
963 }
964 #undef __FUNCT__
965 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
966 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
967 {
968   PetscErrorCode       ierr;
969   Mat                  P_loc,P_oth;
970   Mat_MatMatMultMPI    *ptap;
971   PetscObjectContainer container;
972   FreeSpaceList        free_space=PETSC_NULL,current_space;
973   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
974   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,
975                        *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
976   Mat_SeqAIJ           *p_loc,*p_oth;
977   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj,*rmap=p->garray;
978   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
979   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
980   PetscInt             aN=A->N,am=A->m,pN=P->N;
981   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi,row;
982   PetscBT              lnkbt;
983   PetscInt             prstart,prend,nprow_loc,nprow_oth;
984   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
985 
986   MPI_Comm             comm=A->comm;
987   Mat                  B_mpi;
988   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
989   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
990   PetscInt             len,proc,*dnz,*onz,*owners;
991   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
992   PetscInt             nrows,tnrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
993   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
994   MPI_Status           *status;
995   Mat_Merge_SeqsToMPI  *merge;
996   PetscInt             *ptdi,*ptdj,*cdi,*cdj;
997   PetscInt             *ptoi,*ptoj,*coi,*coj;
998 
999   PetscFunctionBegin;
1000   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
1001   if (container) {
1002     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
1003   } else {
1004     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
1005   }
1006 
1007   /* determine row ownership of C */
1008   /*---------------------------------------------------------*/
1009   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1010   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1011   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
1012 
1013   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1014   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
1015   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
1016   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
1017 
1018   /* get data from P_loc and P_oth */
1019   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
1020   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
1021   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
1022   prend = prstart + pm;
1023 
1024   /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
1025   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
1026   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
1027   ptasparserow_loc = ptadenserow_loc + aN;
1028   ptadenserow_oth  = ptasparserow_loc + aN;
1029   ptasparserow_oth = ptadenserow_oth + aN;
1030 
1031   /* create and initialize a linked list */
1032   nlnk = pN+1;
1033   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1034 
1035   /* Pt = P_loc^T */
1036   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1037   ierr = MatPtAPSymbolicLocal_Private(pN,pti,ptj,ci,cj);CHKERRQ(ierr);
1038 
1039   /* Pt = Pd^T - diagonal portion of P_loc */
1040   ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);
1041   ierr = MatPtAPSymbolicLocal_Private(pn,ptdi,ptdj,cdi,cdj);CHKERRQ(ierr);
1042   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);
1043 #ifdef NEW
1044   /* Pt = Po^T - off-diagonal portion of P_loc */
1045   ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&ptoi,&ptoj);CHKERRQ(ierr);
1046   ierr = MatPtAPSymbolicLocal_Private(tnrows,ptoi,ptoj,coi,coj);CHKERRQ(ierr);
1047   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&ptoi,&ptoj);CHKERRQ(ierr);
1048 #endif
1049   /* clean up */
1050   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
1051   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1052   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1053 
1054   /* determine the number of messages to send, their lengths */
1055   /*---------------------------------------------------------*/
1056   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
1057   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1058   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
1059   len_s  = merge->len_s;
1060   len = 0;  /* max length of buf_si[] */
1061   merge->nsend = 0;
1062   tnrows = (p->B)->N; /* total num of rows to be sent to other processors */
1063   proc = 0;
1064   for (i=0; i<tnrows; i++){
1065     while (rmap[i] >= owners[proc+1]) proc++;
1066     len_si[proc]++;
1067   }
1068   for (proc=0; proc<size; proc++){
1069     len_s[proc] = 0;
1070     if (len_si[proc]){
1071       merge->nsend++;
1072       len_si[proc] = 2*(len_si[proc] + 1);
1073       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of col indices to be sent to [proc] */
1074       len += len_si[proc];
1075     }
1076   }
1077 
1078   /* determine the number and length of messages to receive for ij-structure */
1079   /*-------------------------------------------------------------------------*/
1080   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1081   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1082 
1083   /* post the Irecv of j-structure */
1084   /*-------------------------------*/
1085   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
1086   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
1087 
1088   /* post the Isend of j-structure */
1089   /*--------------------------------*/
1090   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
1091   sj_waits = si_waits + merge->nsend;
1092 
1093   for (proc=0, k=0; proc<size; proc++){
1094     if (!len_s[proc]) continue;
1095     i = owners[proc];
1096     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
1097     k++;
1098   }
1099 
1100   /* receives and sends of j-structure are complete */
1101   /*------------------------------------------------*/
1102   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
1103   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
1104   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
1105 
1106   /* send and recv i-structure */
1107   /*---------------------------*/
1108   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
1109   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
1110 
1111   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
1112   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1113 
1114   j = 0; /* row index to be sent */
1115   for (proc=0,k=0; proc<size; proc++){
1116     if (!len_s[proc]) continue;
1117     /* form outgoing message for i-structure:
1118          buf_si[0]:                 nrows to be sent
1119                [1:nrows]:           row index (global)
1120                [nrows+1:2*nrows+1]: i-structure index
1121     */
1122     /*-------------------------------------------*/
1123     nrows = len_si[proc]/2 - 1;
1124     buf_si_i    = buf_si + nrows+1;
1125     buf_si[0]   = nrows;
1126     buf_si_i[0] = 0;
1127     for (i=0; i<nrows; i++){
1128       row = rmap[j++];
1129       anzi = ci[row+1] - ci[row];
1130       buf_si_i[i+1] = buf_si_i[i] + anzi; /* i-structure */
1131       buf_si[i+1]   = row - owners[proc]; /* local row index */
1132     }
1133     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
1134     k++;
1135     buf_si += len_si[proc];
1136   }
1137 
1138   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
1139   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
1140 
1141   ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
1142   for (i=0; i<merge->nrecv; i++){
1143     ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
1144   }
1145 
1146   ierr = PetscFree(len_si);CHKERRQ(ierr);
1147   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1148   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
1149   ierr = PetscFree(si_waits);CHKERRQ(ierr);
1150   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
1151   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1152   ierr = PetscFree(status);CHKERRQ(ierr);
1153 
1154   /* compute a local seq matrix in each processor */
1155   /*----------------------------------------------*/
1156   /* allocate bi array and free space for accumulating nonzero column info */
1157   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
1158   bi[0] = 0;
1159 
1160   /* create and initialize a linked list */
1161   nlnk = pN+1;
1162   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1163 
1164   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
1165   len = 0;
1166   len  = ci[owners[rank+1]] - ci[owners[rank]];
1167   free_space=PETSC_NULL;
1168   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
1169   current_space = free_space;
1170 
1171   /* determine symbolic info for each local row of C */
1172   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
1173   nextrow = buf_ri_k + merge->nrecv;
1174   nextci  = nextrow + merge->nrecv;
1175   for (k=0; k<merge->nrecv; k++){
1176     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1177     nrows = *buf_ri_k[k];
1178     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1179     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the offset part of the k-th received i-structure */
1180   }
1181 
1182   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
1183   len = 0;
1184   for (i=0; i<pn; i++) {
1185     bnzi = 0;
1186     /* add local non-zero cols of this proc's C_seq into lnk */
1187     anzi = cdi[i+1] - cdi[i];
1188     cji  = cdj + cdi[i];
1189     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1190     bnzi += nlnk;
1191     /* add received col data into lnk */
1192     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1193       if (i == *nextrow[k]) { /* i-th row */
1194         anzi = *(nextci[k]+1) - *nextci[k];
1195         cji  = buf_rj[k] + *nextci[k];
1196         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1197         bnzi += nlnk;
1198         nextrow[k]++; nextci[k]++;
1199       }
1200     }
1201     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
1202 
1203     /* if free space is not available, make more free space */
1204     if (current_space->local_remaining<bnzi) {
1205       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1206       nspacedouble++;
1207     }
1208     /* copy data into free space, then initialize lnk */
1209     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1210     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
1211 
1212     current_space->array           += bnzi;
1213     current_space->local_used      += bnzi;
1214     current_space->local_remaining -= bnzi;
1215 
1216     bi[i+1] = bi[i] + bnzi;
1217   }
1218 
1219   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
1220 
1221   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
1222   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1223   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1224 
1225   /* create symbolic parallel matrix B_mpi */
1226   /*---------------------------------------*/
1227   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
1228   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
1229   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
1230   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1231 
1232   /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
1233   B_mpi->assembled     = PETSC_FALSE;
1234   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
1235   merge->bi            = bi;
1236   merge->bj            = bj;
1237   merge->ci            = ci;
1238   merge->cj            = cj;
1239   merge->buf_ri        = buf_ri;
1240   merge->buf_rj        = buf_rj;
1241   merge->C_seq         = PETSC_NULL;
1242 
1243   /* attach the supporting struct to B_mpi for reuse */
1244   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
1245   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
1246   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
1247   *C = B_mpi;
1248 
1249   ierr = PetscFree(cdi);CHKERRQ(ierr);
1250   ierr = PetscFree(cdj);CHKERRQ(ierr);
1251 #ifdef NEW
1252   ierr = PetscFree(coi);CHKERRQ(ierr);
1253   ierr = PetscFree(coj);CHKERRQ(ierr);
1254 #endif
1255   PetscFunctionReturn(0);
1256 }
1257 
1258 #undef __FUNCT__
1259 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
1260 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1261 {
1262   PetscErrorCode       ierr;
1263   Mat_Merge_SeqsToMPI  *merge;
1264   Mat_MatMatMultMPI    *ptap;
1265   PetscObjectContainer cont_merge,cont_ptap;
1266   PetscInt             flops=0;
1267   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1268   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1269   Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data,*p_oth;
1270   Mat_SeqAIJ           *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
1271   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense;
1272   PetscInt             *pi_oth,*pj_oth,*pJ_d=pd->j,*pJ_o=po->j,*pjj;
1273   PetscInt             i,j,k,nzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1274   MatScalar            *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj,**abuf_r,*ba_i,*ca;
1275   MatScalar            *pA_d=pd->a,*pA_o=po->a,*pa_oth;
1276   PetscInt             am=A->m,cN=C->N,cm=C->m;
1277   MPI_Comm             comm=C->comm;
1278   PetscMPIInt          size,rank,taga,*len_s,**buf_ri,**buf_rj,**buf_ri_k,**nextrow,**nextcseqi;
1279   PetscInt             *owners,proc,nrows;
1280   PetscInt             *cjj,cseqnzi,*bj_i,*bi,*bj,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
1281   MPI_Request          *s_waits,*r_waits;
1282   MPI_Status           *status;
1283   PetscInt             *cseqi,*cseqj,col;
1284   PetscInt             stages[3];
1285 
1286   PetscFunctionBegin;
1287   ierr = PetscLogStageRegister(&stages[0],"NumAP_local");CHKERRQ(ierr);
1288   ierr = PetscLogStageRegister(&stages[1],"NumPtAP_local");CHKERRQ(ierr);
1289   ierr = PetscLogStageRegister(&stages[2],"NumPtAP_Comm");CHKERRQ(ierr);
1290 
1291   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
1292   if (cont_merge) {
1293     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
1294   } else {
1295     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
1296   }
1297   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
1298   if (cont_ptap) {
1299     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
1300   } else {
1301     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
1302   }
1303   /* get data from symbolic products */
1304   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
1305   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
1306 
1307   cseqi = merge->ci; cseqj=merge->cj;
1308   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
1309   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
1310 
1311   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1312   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1313   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
1314 
1315   /* Allocate temporary array for storage of one row of A*P and one row of C */
1316   ierr = PetscMalloc(2*cN*(sizeof(MatScalar)+sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1317   ierr = PetscMemzero(apa,2*cN*(sizeof(MatScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
1318   ca       = (MatScalar*)(apa + cN);
1319   apj      = (PetscInt *)(ca + cN);
1320   apjdense = (PetscInt *)(apj + cN);
1321 
1322   /* Clear old values in C */
1323   ierr = PetscMemzero(cd->a,cd->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1324   ierr = PetscMemzero(co->a,co->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1325 
1326   for (i=0;i<am;i++) {
1327     ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
1328     /* Form i-th sparse row of A*P */
1329     apnzj = 0;
1330     /* diagonal portion of A */
1331     nzi  = adi[i+1] - adi[i];
1332     for (j=0;j<nzi;j++) {
1333       prow = *adj++;
1334       /* diagonal portion of P */
1335       pnzj = pd->i[prow+1] - pd->i[prow];
1336       pjj  = pd->j + pd->i[prow]; /* local col index of P */
1337       paj  = pd->a + pd->i[prow];
1338       for (k=0;k<pnzj;k++) {
1339         col = *pjj + p->cstart; pjj++; /* global col index of P */
1340         if (!apjdense[col]) {
1341           apjdense[col] = -1;
1342           apj[apnzj++]  = col;
1343         }
1344         apa[col] += (*ada)*paj[k];
1345       }
1346       flops += 2*pnzj;
1347       /* off-diagonal portion of P */
1348       pnzj = po->i[prow+1] - po->i[prow];
1349       pjj  = po->j + po->i[prow]; /* local col index of P */
1350       paj  = po->a + po->i[prow];
1351       for (k=0;k<pnzj;k++) {
1352         col = p->garray[*pjj]; pjj++; /* global col index of P */
1353         if (!apjdense[col]) {
1354           apjdense[col] = -1;
1355           apj[apnzj++]  = col;
1356         }
1357         apa[col] += (*ada)*paj[k];
1358       }
1359       flops += 2*pnzj;
1360 
1361       ada++;
1362     }
1363     /* off-diagonal portion of A */
1364     nzi  = aoi[i+1] - aoi[i];
1365     for (j=0;j<nzi;j++) {
1366       prow = *aoj++;
1367       pnzj = pi_oth[prow+1] - pi_oth[prow];
1368       pjj  = pj_oth + pi_oth[prow];
1369       paj  = pa_oth + pi_oth[prow];
1370       for (k=0;k<pnzj;k++) {
1371         if (!apjdense[pjj[k]]) {
1372           apjdense[pjj[k]] = -1;
1373           apj[apnzj++]     = pjj[k];
1374         }
1375         apa[pjj[k]] += (*aoa)*paj[k];
1376       }
1377       flops += 2*pnzj;
1378       aoa++;
1379     }
1380     /* Sort the j index array for quick sparse axpy. */
1381     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1382 
1383     ierr = PetscLogStagePop();CHKERRQ(ierr);
1384     ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
1385     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
1386     /* diagonal portion of P -- gives matrix value of local C */
1387     pnzi = pd->i[i+1] - pd->i[i];
1388     for (j=0;j<pnzi;j++) {
1389       /* add value into C */
1390       for (k=0; k<apnzj; k++) ca[k] = (*pA_d)*apa[apj[k]];
1391       crow = (*pJ_d++) + owners[rank];
1392       ierr = MatSetValues(C,1,&crow,apnzj,apj,ca,ADD_VALUES);CHKERRQ(ierr);
1393       ierr = PetscMemzero(ca,apnzj*(sizeof(MatScalar)));CHKERRQ(ierr);
1394       pA_d++;
1395       flops += 2*apnzj;
1396     }
1397 
1398     /* off-diagonal portion of P -- gives matrix values to be sent to others */
1399     pnzi = po->i[i+1] - po->i[i];
1400     for (j=0;j<pnzi;j++) {
1401       crow   = p->garray[*pJ_o++];
1402       cjj    = cseqj + cseqi[crow];
1403       caj    = cseqa + cseqi[crow];
1404       /* add value into C_seq to be sent to other processors */
1405       nextap = 0;
1406       for (k=0;nextap<apnzj;k++) {
1407         if (cjj[k]==apj[nextap]) {
1408           caj[k] += (*pA_o)*apa[apj[nextap++]];
1409         }
1410       }
1411       flops += 2*apnzj;
1412       pA_o++;
1413     }
1414 
1415     /* Zero the current row info for A*P */
1416     for (j=0;j<apnzj;j++) {
1417       apa[apj[j]]      = 0.;
1418       apjdense[apj[j]] = 0;
1419     }
1420     ierr = PetscLogStagePop();CHKERRQ(ierr);
1421   }
1422 
1423   /* send and recv matrix values */
1424   /*-----------------------------*/
1425   ierr = PetscLogStagePush(stages[2]);CHKERRQ(ierr);
1426   bi     = merge->bi;
1427   bj     = merge->bj;
1428   buf_ri = merge->buf_ri;
1429   buf_rj = merge->buf_rj;
1430   len_s  = merge->len_s;
1431   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
1432   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1433 
1434   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
1435   for (proc=0,k=0; proc<size; proc++){
1436     if (!len_s[proc]) continue;
1437     i = owners[proc];
1438     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1439     k++;
1440   }
1441   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
1442   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
1443   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
1444   ierr = PetscFree(status);CHKERRQ(ierr);
1445 
1446   ierr = PetscFree(s_waits);CHKERRQ(ierr);
1447   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1448   ierr = PetscFree(cseqa);CHKERRQ(ierr);
1449 
1450   /* insert mat values of mpimat */
1451   /*----------------------------*/
1452   ba_i = apa; /* rename, temporary array for storage of one row of C */
1453   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
1454   nextrow = buf_ri_k + merge->nrecv;
1455   nextcseqi = nextrow + merge->nrecv;
1456 
1457   for (k=0; k<merge->nrecv; k++){
1458     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1459     nrows        = *(buf_ri_k[k]);
1460     nextrow[k]   = buf_ri_k[k]+1;  /* next row index of k-th recved i-structure */
1461     nextcseqi[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th recved i-structure */
1462   }
1463 
1464   /* add received local vals to C */
1465   for (i=0; i<cm; i++) {
1466     crow = owners[rank] + i;
1467     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
1468     bnzi = bi[i+1] - bi[i];
1469     nzi = 0;
1470     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1471       /* i-th row */
1472       if (i == *nextrow[k]) {
1473         cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k];
1474         cseqj   = buf_rj[k] + *(nextcseqi[k]);
1475         cseqa   = abuf_r[k] + *(nextcseqi[k]);
1476         nextcseqj = 0;
1477         for (j=0; nextcseqj<cseqnzi; j++){
1478           if (*(bj_i + j) == cseqj[nextcseqj]){ /* bcol == cseqcol */
1479             ba_i[j] += cseqa[nextcseqj++];
1480             nzi++;
1481           }
1482         }
1483         nextrow[k]++; nextcseqi[k]++;
1484       }
1485     }
1486     if (nzi>0){
1487       ierr = MatSetValues(C,1,&crow,bnzi,bj_i,ba_i,ADD_VALUES);CHKERRQ(ierr);
1488       ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
1489       flops += 2*nzi;
1490     }
1491   }
1492   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1493   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1494   ierr = PetscLogStagePop();CHKERRQ(ierr);
1495 
1496   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1497   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
1498   ierr = PetscFree(apa);CHKERRQ(ierr);
1499   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1500 
1501   PetscFunctionReturn(0);
1502 }
1503 #endif /* TOBEREMOVED */
1504 
1505 #undef __FUNCT__
1506 #define __FUNCT__ "MatPtAP_SeqAIJ_SeqAIJ"
1507 PetscErrorCode MatPtAP_SeqAIJ_SeqAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
1508 {
1509   PetscErrorCode ierr;
1510 
1511   PetscFunctionBegin;
1512   if (scall == MAT_INITIAL_MATRIX){
1513     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1514     ierr = MatPtAPSymbolic_SeqAIJ_SeqAIJ(A,P,fill,C);CHKERRQ(ierr);
1515     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1516   }
1517   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1518   ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,*C);CHKERRQ(ierr);
1519   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1520   PetscFunctionReturn(0);
1521 }
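
/*
   Example (a hedged usage sketch, not part of the library): the MAT_INITIAL_MATRIX /
   MAT_REUSE_MATRIX branch handled above is normally driven as follows.  A and P are
   assumed to be assembled SeqAIJ matrices created by the caller, and fill=1.0 is only
   a guess for the expected nnz(C)/(nnz(A)+nnz(P)) ratio.

     Mat            A,P,C;
     PetscErrorCode ierr;

     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);    (symbolic + numeric phases)
     ...  change the numerical values of A, keeping its nonzero pattern  ...
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);      (numeric phase only)
     ierr = MatDestroy(C);CHKERRQ(ierr);
*/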
1522 
1523 /*
1524    MatPtAPSymbolic - Creates the (i,j) structure of the matrix projection C = P^T * A * P
1525 
1526    Collective on Mat
1527 
1528    Input Parameters:
1529 +  A - the matrix
1530 -  P - the projection matrix
1531 
1532    Output Parameters:
1533 .  C - the (i,j) structure of the product matrix
1534 
1535    Notes:
1536    C will be created and must be destroyed by the user with MatDestroy().
1537 
1538    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
1539    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.  The product is computed using
1540    this (i,j) structure by calling MatPtAPNumeric().
1541 
1542    Level: intermediate
1543 
1544 .seealso: MatPtAP(),MatPtAPNumeric(),MatMatMultSymbolic()
1545 */
1546 #undef __FUNCT__
1547 #define __FUNCT__ "MatPtAPSymbolic"
1548 PetscErrorCode MatPtAPSymbolic(Mat A,Mat P,PetscReal fill,Mat *C)
1549 {
1550   PetscErrorCode ierr;
1551   PetscErrorCode (*fA)(Mat,Mat,PetscReal,Mat*);
1552   PetscErrorCode (*fP)(Mat,Mat,PetscReal,Mat*);
1553 
1554   PetscFunctionBegin;
1555 
1556   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
1557   PetscValidType(A,1);
1558   MatPreallocated(A);
1559   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1560   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1561 
1562   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
1563   PetscValidType(P,2);
1564   MatPreallocated(P);
1565   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1566   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1567 
1568   PetscValidPointer(C,3);
1569 
1570   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
1571   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
1572 
1573   /* For now, we do not dispatch based on the type of A and P */
1574   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
1575   fA = A->ops->ptapsymbolic;
1576   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for A of type %s",A->type_name);
1577   fP = P->ops->ptapsymbolic;
1578   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for P of type %s",P->type_name);
1579   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPSymbolic requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
1580 
1581   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1582   ierr = (*fA)(A,P,fill,C);CHKERRQ(ierr);
1583   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1584 
1585   PetscFunctionReturn(0);
1586 }
1587 
1588 typedef struct {
1589   Mat    symAP;
1590 } Mat_PtAPstruct;
1591 
1592 EXTERN PetscErrorCode MatDestroy_SeqAIJ(Mat);
1593 
1594 #undef __FUNCT__
1595 #define __FUNCT__ "MatDestroy_SeqAIJ_PtAP"
1596 PetscErrorCode MatDestroy_SeqAIJ_PtAP(Mat A)
1597 {
1598   PetscErrorCode    ierr;
1599   Mat_PtAPstruct    *ptap=(Mat_PtAPstruct*)A->spptr;
1600 
1601   PetscFunctionBegin;
1602   ierr = MatDestroy(ptap->symAP);CHKERRQ(ierr);
1603   ierr = PetscFree(ptap);CHKERRQ(ierr);
1604   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
1605   PetscFunctionReturn(0);
1606 }
1607 
1608 #undef __FUNCT__
1609 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqAIJ"
1610 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
1611 {
1612   PetscErrorCode ierr;
1613   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1614   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*p = (Mat_SeqAIJ*)P->data,*c;
1615   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
1616   PetscInt       *ci,*cj,*ptadenserow,*ptasparserow,*ptaj;
1617   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M;
1618   PetscInt       i,j,k,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,nlnk,*lnk;
1619   MatScalar      *ca;
1620   PetscBT        lnkbt;
1621 
1622   PetscFunctionBegin;
1623   /* Get ij structure of P^T */
1624   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1625   ptJ=ptj;
1626 
1627   /* Allocate ci array, arrays for fill computation and */
1628   /* free space for accumulating nonzero column info */
1629   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1630   ci[0] = 0;
1631 
1632   ierr = PetscMalloc((2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1633   ierr = PetscMemzero(ptadenserow,(2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1634   ptasparserow = ptadenserow  + an;
1635 
1636   /* create and initialize a linked list */
1637   nlnk = pn+1;
1638   ierr = PetscLLCreate(pn,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1639 
1640   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1641   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
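  /* For example (hypothetical sizes): nnz(A) = ai[am] = 20000, pm = 10000 and pn = 2500 */
  /* give an initial request of (20000/10000)*2500 = 5000 entries.                       */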
1642   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
1643   current_space = free_space;
1644 
1645   /* Determine symbolic info for each row of C: */
1646   for (i=0;i<pn;i++) {
1647     ptnzi  = pti[i+1] - pti[i];
1648     ptanzi = 0;
1649     /* Determine symbolic row of PtA: */
1650     for (j=0;j<ptnzi;j++) {
1651       arow = *ptJ++;
1652       anzj = ai[arow+1] - ai[arow];
1653       ajj  = aj + ai[arow];
1654       for (k=0;k<anzj;k++) {
1655         if (!ptadenserow[ajj[k]]) {
1656           ptadenserow[ajj[k]]    = -1;
1657           ptasparserow[ptanzi++] = ajj[k];
1658         }
1659       }
1660     }
1661     /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1662     ptaj = ptasparserow;
1663     cnzi   = 0;
1664     for (j=0;j<ptanzi;j++) {
1665       prow = *ptaj++;
1666       pnzj = pi[prow+1] - pi[prow];
1667       pjj  = pj + pi[prow];
1668       /* add non-zero cols of P into the sorted linked list lnk */
1669       ierr = PetscLLAdd(pnzj,pjj,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1670       cnzi += nlnk;
1671     }
1672 
1673     /* If free space is not available, make more free space */
1674     /* Double the amount of total space in the list */
1675     if (current_space->local_remaining<cnzi) {
1676       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1677     }
1678 
1679     /* Copy data into free space, and zero out denserows */
1680     ierr = PetscLLClean(pn,pn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1681     current_space->array           += cnzi;
1682     current_space->local_used      += cnzi;
1683     current_space->local_remaining -= cnzi;
1684 
1685     for (j=0;j<ptanzi;j++) {
1686       ptadenserow[ptasparserow[j]] = 0;
1687     }
1688     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1689     /*        For now, we will recompute what is needed. */
1690     ci[i+1] = ci[i] + cnzi;
1691   }
1692   /* nnz is now stored in ci[pn], column indices are in the list of free space */
1693   /* Allocate space for cj, initialize cj, and */
1694   /* destroy list of free space and other temporary array(s) */
1695   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1696   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1697   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1698   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1699 
1700   /* Allocate space for ca */
1701   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1702   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1703 
1704   /* put together the new matrix */
1705   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1706 
1707   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1708   /* Since these are PETSc arrays, change flags to free them as necessary. */
1709   c = (Mat_SeqAIJ *)((*C)->data);
1710   c->freedata = PETSC_TRUE;
1711   c->nonew    = 0;
1712 
1713   /* Clean up. */
1714   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1715 
1716   PetscFunctionReturn(0);
1717 }
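
/*
   A minimal sketch (hypothetical helper, not used by this file) of the flag-array technique
   MatPtAPSymbolic_SeqAIJ_SeqAIJ uses to build each symbolic row of P^T*A: a dense array of
   flags lets every candidate column be tested in O(1) and recorded only the first time it
   appears, and the flags are reset from the compressed list so the work stays proportional
   to the nonzeros of the row rather than to the matrix size.

     static PetscInt UnionOfRowPatterns(PetscInt nrows,const PetscInt *rows,
                                        const PetscInt *ai,const PetscInt *aj,
                                        PetscInt *denseflag,PetscInt *sparserow)
     {
       PetscInt i,k,row,nz = 0;
       for (i=0; i<nrows; i++) {
         row = rows[i];
         for (k=ai[row]; k<ai[row+1]; k++) {
           if (!denseflag[aj[k]]) {
             denseflag[aj[k]] = -1;                        flag column on first appearance
             sparserow[nz++]  = aj[k];                     and record it in compressed form
           }
         }
       }
       for (i=0; i<nz; i++) denseflag[sparserow[i]] = 0;   reset flags for the next row
       return nz;                                          number of distinct columns found
     }
*/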
1718 
1719 #include "src/mat/impls/maij/maij.h"
1720 EXTERN_C_BEGIN
1721 #undef __FUNCT__
1722 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqMAIJ"
1723 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqMAIJ(Mat A,Mat PP,Mat *C)
1724 {
1725   /* This routine requires testing -- I don't think it works. */
1726   PetscErrorCode ierr;
1727   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1728   Mat_SeqMAIJ    *pp=(Mat_SeqMAIJ*)PP->data;
1729   Mat            P=pp->AIJ;
1730   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*p=(Mat_SeqAIJ*)P->data,*c;
1731   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
1732   PetscInt       *ci,*cj,*denserow,*sparserow,*ptadenserow,*ptasparserow,*ptaj;
1733   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M,ppdof=pp->dof;
1734   PetscInt       i,j,k,dof,pdof,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi;
1735   MatScalar      *ca;
1736 
1737   PetscFunctionBegin;
1738   /* Start timer */
1739   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1740 
1741   /* Get ij structure of P^T */
1742   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1743 
1744   /* Allocate ci array, arrays for fill computation and */
1745   /* free space for accumulating nonzero column info */
1746   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1747   ci[0] = 0;
1748 
1749   ierr = PetscMalloc((2*pn+2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1750   ierr = PetscMemzero(ptadenserow,(2*pn+2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1751   ptasparserow = ptadenserow  + an;
1752   denserow     = ptasparserow + an;
1753   sparserow    = denserow     + pn;
1754 
1755   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1756   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1757   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
1758   current_space = free_space;
1759 
1760   /* Determine symbolic info for each row of C: */
1761   for (i=0;i<pn/ppdof;i++) {
1762     ptnzi  = pti[i+1] - pti[i];
1763     ptanzi = 0;
1764     ptJ    = ptj + pti[i];
1765     for (dof=0;dof<ppdof;dof++) {
1766     /* Determine symbolic row of PtA: */
1767       for (j=0;j<ptnzi;j++) {
1768         arow = ptJ[j] + dof;
1769         anzj = ai[arow+1] - ai[arow];
1770         ajj  = aj + ai[arow];
1771         for (k=0;k<anzj;k++) {
1772           if (!ptadenserow[ajj[k]]) {
1773             ptadenserow[ajj[k]]    = -1;
1774             ptasparserow[ptanzi++] = ajj[k];
1775           }
1776         }
1777       }
1778       /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1779       ptaj = ptasparserow;
1780       cnzi   = 0;
1781       for (j=0;j<ptanzi;j++) {
1782         pdof = *ptaj%ppdof;
1783         prow = (*ptaj++)/ppdof;
1784         pnzj = pi[prow+1] - pi[prow];
1785         pjj  = pj + pi[prow];
1786         for (k=0;k<pnzj;k++) {
1787           if (!denserow[pjj[k]+pdof]) {
1788             denserow[pjj[k]+pdof] = -1;
1789             sparserow[cnzi++]     = pjj[k]+pdof;
1790           }
1791         }
1792       }
1793 
1794       /* sort sparserow */
1795       ierr = PetscSortInt(cnzi,sparserow);CHKERRQ(ierr);
1796 
1797       /* If free space is not available, make more free space */
1798       /* Double the amount of total space in the list */
1799       if (current_space->local_remaining<cnzi) {
1800         ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1801       }
1802 
1803       /* Copy data into free space, and zero out denserows */
1804       ierr = PetscMemcpy(current_space->array,sparserow,cnzi*sizeof(PetscInt));CHKERRQ(ierr);
1805       current_space->array           += cnzi;
1806       current_space->local_used      += cnzi;
1807       current_space->local_remaining -= cnzi;
1808 
1809       for (j=0;j<ptanzi;j++) {
1810         ptadenserow[ptasparserow[j]] = 0;
1811       }
1812       for (j=0;j<cnzi;j++) {
1813         denserow[sparserow[j]] = 0;
1814       }
1815       /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1816       /*        For now, we will recompute what is needed. */
1817       ci[i+1+dof] = ci[i+dof] + cnzi;
1818     }
1819   }
1820   /* nnz is now stored in ci[pn], column indices are in the list of free space */
1821   /* Allocate space for cj, initialize cj, and */
1822   /* destroy list of free space and other temporary array(s) */
1823   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1824   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1825   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1826 
1827   /* Allocate space for ca */
1828   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1829   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1830 
1831   /* put together the new matrix */
1832   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1833 
1834   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1835   /* Since these are PETSc arrays, change flags to free them as necessary. */
1836   c = (Mat_SeqAIJ *)((*C)->data);
1837   c->freedata = PETSC_TRUE;
1838   c->nonew    = 0;
1839 
1840   /* Clean up. */
1841   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1842 
1843   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1844   PetscFunctionReturn(0);
1845 }
1846 EXTERN_C_END
1847 
1848 /*
1849    MatPtAPNumeric - Computes the matrix projection C = P^T * A * P
1850 
1851    Collective on Mat
1852 
1853    Input Parameters:
1854 +  A - the matrix
1855 -  P - the projection matrix
1856 
1857    Output Parameters:
1858 .  C - the product matrix
1859 
1860    Notes:
1861    C must have been created by calling MatPtAPSymbolic() and must be destroyed by
1862    the user using MatDestroy().
1863 
1864    This routine is currently only implemented for pairs of AIJ matrices and classes
1865    which inherit from AIJ.  C will be of type MATAIJ.
1866 
1867    Level: intermediate
1868 
1869 .seealso: MatPtAP(),MatPtAPSymbolic(),MatMatMultNumeric()
1870 */
1871 #undef __FUNCT__
1872 #define __FUNCT__ "MatPtAPNumeric"
1873 PetscErrorCode MatPtAPNumeric(Mat A,Mat P,Mat C)
1874 {
1875   PetscErrorCode ierr;
1876   PetscErrorCode (*fA)(Mat,Mat,Mat);
1877   PetscErrorCode (*fP)(Mat,Mat,Mat);
1878 
1879   PetscFunctionBegin;
1880 
1881   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
1882   PetscValidType(A,1);
1883   MatPreallocated(A);
1884   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1885   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1886 
1887   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
1888   PetscValidType(P,2);
1889   MatPreallocated(P);
1890   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1891   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1892 
1893   PetscValidHeaderSpecific(C,MAT_COOKIE,3);
1894   PetscValidType(C,3);
1895   MatPreallocated(C);
1896   if (!C->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1897   if (C->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1898 
1899   if (P->N!=C->M) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->M);
1900   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
1901   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
1902   if (P->N!=C->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->N);
1903 
1904   /* For now, we do not dispatch based on the type of A and P */
1905   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
1906   fA = A->ops->ptapnumeric;
1907   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for A of type %s",A->type_name);
1908   fP = P->ops->ptapnumeric;
1909   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for P of type %s",P->type_name);
1910   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPNumeric requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
1911 
1912   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1913   ierr = (*fA)(A,P,C);CHKERRQ(ierr);
1914   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1915 
1916   PetscFunctionReturn(0);
1917 }
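
/*
   Example (a hedged usage sketch): the two-phase calling sequence the interfaces above expect.
   The symbolic phase builds the nonzero structure of C once; the numeric phase may then be
   repeated whenever only the numerical values of A change.  A and P are assumed to be
   assembled matrices of a supported type created by the caller.

     Mat            A,P,C;
     PetscErrorCode ierr;

     ierr = MatPtAPSymbolic(A,P,1.0,&C);CHKERRQ(ierr);
     ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);
     ...  update the values of A, keeping its nonzero pattern  ...
     ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);
     ierr = MatDestroy(C);CHKERRQ(ierr);
*/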
1918 
1919 #undef __FUNCT__
1920 #define __FUNCT__ "MatPtAPNumeric_SeqAIJ_SeqAIJ"
1921 PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ(Mat A,Mat P,Mat C)
1922 {
1923   PetscErrorCode ierr;
1924   PetscInt       flops=0;
1925   Mat_SeqAIJ     *a  = (Mat_SeqAIJ *) A->data;
1926   Mat_SeqAIJ     *p  = (Mat_SeqAIJ *) P->data;
1927   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
1928   PetscInt       *ai=a->i,*aj=a->j,*apj,*apjdense,*pi=p->i,*pj=p->j,*pJ=p->j,*pjj;
1929   PetscInt       *ci=c->i,*cj=c->j,*cjj;
1930   PetscInt       am=A->M,cn=C->N,cm=C->M;
1931   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1932   MatScalar      *aa=a->a,*apa,*pa=p->a,*pA=p->a,*paj,*ca=c->a,*caj;
1933 
1934   PetscFunctionBegin;
1935   /* Allocate temporary array for storage of one row of A*P */
1936   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1937   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1938 
1939   apj      = (PetscInt *)(apa + cn);
1940   apjdense = apj + cn;
1941 
1942   /* Clear old values in C */
1943   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1944 
1945   for (i=0;i<am;i++) {
1946     /* Form sparse row of A*P */
1947     anzi  = ai[i+1] - ai[i];
1948     apnzj = 0;
1949     for (j=0;j<anzi;j++) {
1950       prow = *aj++;
1951       pnzj = pi[prow+1] - pi[prow];
1952       pjj  = pj + pi[prow];
1953       paj  = pa + pi[prow];
1954       for (k=0;k<pnzj;k++) {
1955         if (!apjdense[pjj[k]]) {
1956           apjdense[pjj[k]] = -1;
1957           apj[apnzj++]     = pjj[k];
1958         }
1959         apa[pjj[k]] += (*aa)*paj[k];
1960       }
1961       flops += 2*pnzj;
1962       aa++;
1963     }
1964 
1965     /* Sort the j index array for quick sparse axpy. */
1966     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1967 
1968     /* Compute P^T*A*P using outer product (P^T)[:,j]*(A*P)[j,:]. */
1969     pnzi = pi[i+1] - pi[i];
1970     for (j=0;j<pnzi;j++) {
1971       nextap = 0;
1972       crow   = *pJ++;
1973       cjj    = cj + ci[crow];
1974       caj    = ca + ci[crow];
1975       /* Perform sparse axpy operation.  Note cjj includes apj. */
1976       for (k=0;nextap<apnzj;k++) {
1977         if (cjj[k]==apj[nextap]) {
1978           caj[k] += (*pA)*apa[apj[nextap++]];
1979         }
1980       }
1981       flops += 2*apnzj;
1982       pA++;
1983     }
1984 
1985     /* Zero the current row info for A*P */
1986     for (j=0;j<apnzj;j++) {
1987       apa[apj[j]]      = 0.;
1988       apjdense[apj[j]] = 0;
1989     }
1990   }
1991 
1992   /* Assemble the final matrix and clean up */
1993   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1994   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1995   ierr = PetscFree(apa);CHKERRQ(ierr);
1996   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1997 
1998   PetscFunctionReturn(0);
1999 }
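
/*
   Note on the masked sparse axpy used above (a hedged explanation with hypothetical data):
   because the symbolic phase built each row of C as the union of all A*P row patterns that
   contribute to it, the sorted column list cjj of the target row always contains the sorted
   accumulator list apj as a subsequence.  For example, with cjj = {1,3,5,7,9} and apj = {3,7},
   the loop

     for (k=0; nextap<apnzj; k++) {
       if (cjj[k]==apj[nextap]) caj[k] += (*pA)*apa[apj[nextap++]];
     }

   walks k over the columns 1,3,5,7 and performs updates only at k=1 (column 3) and k=3
   (column 7); each contribution costs at most one pass over cjj and needs no searching or
   extra storage.
*/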
2000 
2001 /* Compute local C = P_loc^T * A * P - used by MatPtAP_MPIAIJ_MPIAIJ() */
2002 static PetscEvent logkey_matptapnumeric_local = 0;
2003 #undef __FUNCT__
2004 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_Local"
2005 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,Mat C)
2006 {
2007   PetscErrorCode ierr;
2008   PetscInt       flops=0;
2009   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
2010   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
2011   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
2012   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
2013   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
2014   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
2015   PetscInt       *pJ=pj_loc,*pjj;
2016   PetscInt       *ci=c->i,*cj=c->j,*cjj;
2017   PetscInt       am=A->m,cn=C->N,cm=C->M;
2018   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
2019   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*ca=c->a,*caj;
2020   MatScalar      *pa_loc=p_loc->a,*pA=pa_loc,*pa_oth=p_oth->a;
2021 
2022   PetscFunctionBegin;
2023   if (!logkey_matptapnumeric_local) {
2024     ierr = PetscLogEventRegister(&logkey_matptapnumeric_local,"MatPtAPNumeric_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
2025   }
2026   ierr = PetscLogEventBegin(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
2027 
2028   /* Allocate temporary array for storage of one row of A*P */
2029   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
2030   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
2031   apj      = (PetscInt *)(apa + cn);
2032   apjdense = apj + cn;
2033 
2034   /* Clear old values in C */
2035   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
2036 
2037   for (i=0;i<am;i++) {
2038     /* Form i-th sparse row of A*P */
2039     apnzj = 0;
2040     /* diagonal portion of A */
2041     anzi  = adi[i+1] - adi[i];
2042     for (j=0;j<anzi;j++) {
2043       prow = *adj;
2044       adj++;
2045       pnzj = pi_loc[prow+1] - pi_loc[prow];
2046       pjj  = pj_loc + pi_loc[prow];
2047       paj  = pa_loc + pi_loc[prow];
2048       for (k=0;k<pnzj;k++) {
2049         if (!apjdense[pjj[k]]) {
2050           apjdense[pjj[k]] = -1;
2051           apj[apnzj++]     = pjj[k];
2052         }
2053         apa[pjj[k]] += (*ada)*paj[k];
2054       }
2055       flops += 2*pnzj;
2056       ada++;
2057     }
2058     /* off-diagonal portion of A */
2059     anzi  = aoi[i+1] - aoi[i];
2060     for (j=0;j<anzi;j++) {
2061       col = a->garray[*aoj];
2062       if (col < cstart){
2063         prow = *aoj;
2064       } else if (col >= cend){
2065         prow = *aoj;
2066       } else {
2067         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
2068       }
2069       aoj++;
2070       pnzj = pi_oth[prow+1] - pi_oth[prow];
2071       pjj  = pj_oth + pi_oth[prow];
2072       paj  = pa_oth + pi_oth[prow];
2073       for (k=0;k<pnzj;k++) {
2074         if (!apjdense[pjj[k]]) {
2075           apjdense[pjj[k]] = -1;
2076           apj[apnzj++]     = pjj[k];
2077         }
2078         apa[pjj[k]] += (*aoa)*paj[k];
2079       }
2080       flops += 2*pnzj;
2081       aoa++;
2082     }
2083     /* Sort the j index array for quick sparse axpy. */
2084     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
2085 
2086     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
2087     pnzi = pi_loc[i+1] - pi_loc[i];
2088     for (j=0;j<pnzi;j++) {
2089       nextap = 0;
2090       crow   = *pJ++;
2091       cjj    = cj + ci[crow];
2092       caj    = ca + ci[crow];
2093       /* Perform sparse axpy operation.  Note cjj includes apj. */
2094       for (k=0;nextap<apnzj;k++) {
2095         if (cjj[k]==apj[nextap]) {
2096           caj[k] += (*pA)*apa[apj[nextap++]];
2097         }
2098       }
2099       flops += 2*apnzj;
2100       pA++;
2101     }
2102 
2103     /* Zero the current row info for A*P */
2104     for (j=0;j<apnzj;j++) {
2105       apa[apj[j]]      = 0.;
2106       apjdense[apj[j]] = 0;
2107     }
2108   }
2109 
2110   /* Assemble the final matrix and clean up */
2111   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2112   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2113   ierr = PetscFree(apa);CHKERRQ(ierr);
2114   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
2115   ierr = PetscLogEventEnd(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
2116   PetscFunctionReturn(0);
2117 }
2118 static PetscEvent logkey_matptapsymbolic_local = 0;
2119 #undef __FUNCT__
2120 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local"
2121 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,PetscReal fill,Mat *C)
2122 {
2123   PetscErrorCode ierr;
2124   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
2125   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
2126   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
2127   PetscInt       *pti,*ptj,*ptJ,*ajj,*pjj;
2128   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
2129   PetscInt       *ci,*cj,*ptaj;
2130   PetscInt       aN=A->N,am=A->m,pN=P_loc->N;
2131   PetscInt       i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
2132   PetscInt       pm=P_loc->m,nlnk,*lnk;
2133   MatScalar      *ca;
2134   PetscBT        lnkbt;
2135   PetscInt       prend,nprow_loc,nprow_oth;
2136   PetscInt       *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
2137   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
2138   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
2139 
2140   PetscFunctionBegin;
2141   if (!logkey_matptapsymbolic_local) {
2142     ierr = PetscLogEventRegister(&logkey_matptapsymbolic_local,"MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
2143   }
2144   ierr = PetscLogEventBegin(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
2145 
2146   prend = prstart + pm;
2147 
2148   /* get ij structure of P_loc^T */
2149   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
2150   ptJ=ptj;
2151 
2152   /* Allocate ci array, arrays for fill computation and */
2153   /* free space for accumulating nonzero column info */
2154   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
2155   ci[0] = 0;
2156 
2157   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
2158   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
2159   ptasparserow_loc = ptadenserow_loc + aN;
2160   ptadenserow_oth  = ptasparserow_loc + aN;
2161   ptasparserow_oth = ptadenserow_oth + aN;
2162 
2163   /* create and initialize a linked list */
2164   nlnk = pN+1;
2165   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2166 
2167   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
2168   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
2169   nnz           = adi[am] + aoi[am];
2170   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
2171   current_space = free_space;
2172 
2173   /* determine symbolic info for each row of C: */
2174   for (i=0;i<pN;i++) {
2175     ptnzi  = pti[i+1] - pti[i];
2176     nprow_loc = 0; nprow_oth = 0;
2177     /* i-th row of symbolic P_loc^T*A_loc: */
2178     for (j=0;j<ptnzi;j++) {
2179       arow = *ptJ++;
2180       /* diagonal portion of A */
2181       anzj = adi[arow+1] - adi[arow];
2182       ajj  = adj + adi[arow];
2183       for (k=0;k<anzj;k++) {
2184         col = ajj[k]+prstart;
2185         if (!ptadenserow_loc[col]) {
2186           ptadenserow_loc[col]    = -1;
2187           ptasparserow_loc[nprow_loc++] = col;
2188         }
2189       }
2190       /* off-diagonal portion of A */
2191       anzj = aoi[arow+1] - aoi[arow];
2192       ajj  = aoj + aoi[arow];
2193       for (k=0;k<anzj;k++) {
2194         col = a->garray[ajj[k]];  /* global col */
2195         if (col < cstart){
2196           col = ajj[k];
2197         } else if (col >= cend){
2198           col = ajj[k] + pm;
2199         } else {
2200           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
2201         }
2202         if (!ptadenserow_oth[col]) {
2203           ptadenserow_oth[col]    = -1;
2204           ptasparserow_oth[nprow_oth++] = col;
2205         }
2206       }
2207     }
2208 
2209     /* using symbolic info of local PtA, determine symbolic info for row of C: */
2210     cnzi   = 0;
2211     /* rows of P_loc */
2212     ptaj = ptasparserow_loc;
2213     for (j=0; j<nprow_loc; j++) {
2214       prow = *ptaj++;
2215       prow -= prstart; /* rm */
2216       pnzj = pi_loc[prow+1] - pi_loc[prow];
2217       pjj  = pj_loc + pi_loc[prow];
2218       /* add non-zero cols of P into the sorted linked list lnk */
2219       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2220       cnzi += nlnk;
2221     }
2222     /* rows of P_oth */
2223     ptaj = ptasparserow_oth;
2224     for (j=0; j<nprow_oth; j++) {
2225       prow = *ptaj++;
2226       if (prow >= prend) prow -= pm; /* rm */
2227       pnzj = pi_oth[prow+1] - pi_oth[prow];
2228       pjj  = pj_oth + pi_oth[prow];
2229       /* add non-zero cols of P into the sorted linked list lnk */
2230       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2231       cnzi += nlnk;
2232     }
2233 
2234     /* If free space is not available, make more free space */
2235     /* Double the amount of total space in the list */
2236     if (current_space->local_remaining<cnzi) {
2237       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
2238     }
2239 
2240     /* Copy data into free space, and zero out denserows */
2241     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
2242     current_space->array           += cnzi;
2243     current_space->local_used      += cnzi;
2244     current_space->local_remaining -= cnzi;
2245 
2246     for (j=0;j<nprow_loc; j++) {
2247       ptadenserow_loc[ptasparserow_loc[j]] = 0;
2248     }
2249     for (j=0;j<nprow_oth; j++) {
2250       ptadenserow_oth[ptasparserow_oth[j]] = 0;
2251     }
2252 
2253     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
2254     /*        For now, we will recompute what is needed. */
2255     ci[i+1] = ci[i] + cnzi;
2256   }
2257   /* nnz is now stored in ci[pN], column indices are in the list of free space */
2258   /* Allocate space for cj, initialize cj, and */
2259   /* destroy list of free space and other temporary array(s) */
2260   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
2261   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
2262   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
2263   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2264 
2265   /* Allocate space for ca */
2266   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
2267   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
2268 
2269   /* put together the new matrix */
2270   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,pN,pN,ci,cj,ca,C);CHKERRQ(ierr);
2271 
2272   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
2273   /* Since these are PETSc arrays, change flags to free them as necessary. */
2274   c = (Mat_SeqAIJ *)((*C)->data);
2275   c->freedata = PETSC_TRUE;
2276   c->nonew    = 0;
2277 
2278   /* Clean up. */
2279   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
2280 
2281   ierr = PetscLogEventEnd(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
2282   PetscFunctionReturn(0);
2283 }
2284