xref: /petsc/src/mat/impls/aij/seq/matptap.c (revision 418422e8e2e0b70247f228c4b0535e8fb8cd3653)
1 /*
2   Defines projective product routines where A is an AIJ matrix
3           C = P^T * A * P
4 */
5 
6 #include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
7 #include "src/mat/utils/freespace.h"
8 #include "src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatPtAP"
13 /*@
14    MatPtAP - Creates the matrix projection C = P^T * A * P
15 
16    Collective on Mat
17 
18    Input Parameters:
19 +  A - the matrix
20 .  P - the projection matrix
21 .  scall - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
22 -  fill - expected fill as ratio of nnz(C)/(nnz(A) + nnz(P))
23 
24    Output Parameters:
25 .  C - the product matrix
26 
27    Notes:
28    C will be created and must be destroyed by the user with MatDestroy().
29 
30    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
31    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.
32 
33    Level: intermediate
34 
35 .seealso: MatPtAPSymbolic(),MatPtAPNumeric(),MatMatMult()
36 @*/
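/*
   A minimal usage sketch (illustrative only; assumes A and P are assembled AIJ matrices
   created elsewhere and uses this version's calling sequences):

      Mat C;
      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
      ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);   reuse C after changing values in A
      ierr = MatDestroy(C);CHKERRQ(ierr);
*/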
37 PetscErrorCode MatPtAP(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
38 {
39   PetscErrorCode ierr;
40   PetscErrorCode (*fA)(Mat,Mat,MatReuse,PetscReal,Mat *);
41   PetscErrorCode (*fP)(Mat,Mat,MatReuse,PetscReal,Mat *);
42 
43   PetscFunctionBegin;
44   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
45   PetscValidType(A,1);
46   MatPreallocated(A);
47   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
48   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
49   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
50   PetscValidType(P,2);
51   MatPreallocated(P);
52   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
53   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
54   PetscValidPointer(C,3);
55 
56   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
57 
58   if (fill <=0.0) SETERRQ1(PETSC_ERR_ARG_SIZ,"fill=%g must be > 0.0",fill);
59 
60   /* For now, we do not dispatch based on the type of A and P */
61   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
62   fA = A->ops->ptap;
63   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for A of type %s",A->type_name);
64   fP = P->ops->ptap;
65   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for P of type %s",P->type_name);
66   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAP requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
67 
68   ierr = PetscLogEventBegin(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
69   ierr = (*fA)(A,P,scall,fill,C);CHKERRQ(ierr);
70   ierr = PetscLogEventEnd(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 
74 EXTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,PetscReal,Mat*);
75 EXTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,Mat C);
76 
77 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
78 #undef __FUNCT__
79 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
80 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
81 {
82   PetscErrorCode       ierr;
83   Mat_MatMatMultMPI    *ptap;
84   PetscObjectContainer container;
85 
86   PetscFunctionBegin;
87   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
88   if (container) {
89     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
90     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
91     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
92     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
93     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
94 
95     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
96     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
97     ierr = PetscFree(ptap);CHKERRQ(ierr);
98   }
99 
100   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
101   PetscFunctionReturn(0);
102 }
103 
104 #undef __FUNCT__
105 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
106 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
107 {
108   PetscErrorCode       ierr;
109   Mat_MatMatMultMPI    *ptap;
110   PetscObjectContainer container;
111 
112   PetscFunctionBegin;
113   if (scall == MAT_INITIAL_MATRIX){
114     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
115     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
116 
117     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
118     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
119 
120     /* get P_loc by taking all local rows of P */
121     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
122 
123     /* attach the supporting struct to P for reuse */
124     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
125     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
126     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
127     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
128 
129     /* now, compute symbolic local P^T*A*P */
130     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
131     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
132   } else if (scall == MAT_REUSE_MATRIX){
133     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
134     if (container) {
135       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
136     } else {
137       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
138     }
139 
140     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
141     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
142 
143     /* get P_loc by taking all local rows of P */
144     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
145 
146   } else {
147     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
148   }
149   /* now, compute numeric local P^T*A*P */
150   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
151   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
152   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
153 
154   PetscFunctionReturn(0);
155 }
156 
157 #undef __FUNCT__
158 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
159 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
160 {
161   PetscErrorCode       ierr;
162   Mat                  P_loc,P_oth;
163   Mat_MatMatMultMPI    *ptap;
164   PetscObjectContainer container;
165   FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
166   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
167   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
168   Mat_SeqAIJ           *p_loc,*p_oth;
169   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj;
170   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
171   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
172   PetscInt             aN=A->N,am=A->m,pN=P->N;
173   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
174   PetscBT              lnkbt;
175   PetscInt             prstart,prend,nprow_loc,nprow_oth;
176   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
177 
178   MPI_Comm             comm=A->comm;
179   Mat                  B_mpi;
180   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
181   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
182   PetscInt             len,proc,*dnz,*onz,*owners;
183   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
184   PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
185   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
186   MPI_Status           *status;
187   Mat_Merge_SeqsToMPI  *merge;
188 
189   PetscFunctionBegin;
190   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
191   if (container) {
192     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
193   } else {
194     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
195   }
196   /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
197   /*--------------------------------------------------*/
198   /* get data from P_loc and P_oth */
199   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
200   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
201   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
202   prend = prstart + pm;
203 
204   /* get ij structure of P_loc^T */
205   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
206   ptJ=ptj;
207 
208   /* Allocate ci array, arrays for fill computation and */
209   /* free space for accumulating nonzero column info */
210   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
211   ci[0] = 0;
212 
213   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
214   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
215   ptasparserow_loc = ptadenserow_loc + aN;
216   ptadenserow_oth  = ptasparserow_loc + aN;
217   ptasparserow_oth = ptadenserow_oth + aN;
218 
219   /* create and initialize a linked list */
220   nlnk = pN+1;
221   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
222 
223   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
224   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
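  /* For example (illustrative numbers only): with fill=2, nnz(A)=1000, P->N=100 and  */
  /* P->M=A->N=500, the initial request below is about 2*1000*100/500 = 400 entries.  */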
225   nnz           = adi[am] + aoi[am];
226   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
227   current_space = free_space;
228 
229   /* determine symbolic info for each row of C: */
230   for (i=0;i<pN;i++) {
231     ptnzi  = pti[i+1] - pti[i];
232     nprow_loc = 0; nprow_oth = 0;
233     /* i-th row of symbolic P_loc^T*A_loc: */
234     for (j=0;j<ptnzi;j++) {
235       arow = *ptJ++;
236       /* diagonal portion of A */
237       anzj = adi[arow+1] - adi[arow];
238       ajj  = adj + adi[arow];
239       for (k=0;k<anzj;k++) {
240         col = ajj[k]+prstart;
241         if (!ptadenserow_loc[col]) {
242           ptadenserow_loc[col]    = -1;
243           ptasparserow_loc[nprow_loc++] = col;
244         }
245       }
246       /* off-diagonal portion of A */
247       anzj = aoi[arow+1] - aoi[arow];
248       ajj  = aoj + aoi[arow];
249       for (k=0;k<anzj;k++) {
250         col = a->garray[ajj[k]];  /* global col */
251         if (col < cstart){
252           col = ajj[k];
253         } else if (col >= cend){
254           col = ajj[k] + pm;
255         } else {
256           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
257         }
258         if (!ptadenserow_oth[col]) {
259           ptadenserow_oth[col]    = -1;
260           ptasparserow_oth[nprow_oth++] = col;
261         }
262       }
263     }
264 
265     /* using symbolic info of local PtA, determine symbolic info for row of C: */
266     cnzi   = 0;
267     /* rows of P_loc */
268     ptaj = ptasparserow_loc;
269     for (j=0; j<nprow_loc; j++) {
270       prow = *ptaj++;
271       prow -= prstart; /* rm */
272       pnzj = pi_loc[prow+1] - pi_loc[prow];
273       pjj  = pj_loc + pi_loc[prow];
274       /* add non-zero cols of P into the sorted linked list lnk */
275       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
276       cnzi += nlnk;
277     }
278     /* rows of P_oth */
279     ptaj = ptasparserow_oth;
280     for (j=0; j<nprow_oth; j++) {
281       prow = *ptaj++;
282       if (prow >= prend) prow -= pm; /* rm */
283       pnzj = pi_oth[prow+1] - pi_oth[prow];
284       pjj  = pj_oth + pi_oth[prow];
285       /* add non-zero cols of P into the sorted linked list lnk */
286       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
287       cnzi += nlnk;
288     }
289 
290     /* If free space is not available, make more free space */
291     /* Double the amount of total space in the list */
292     if (current_space->local_remaining<cnzi) {
293       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
294     }
295 
296     /* Copy data into free space, and zero out denserows */
297     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
298     current_space->array           += cnzi;
299     current_space->local_used      += cnzi;
300     current_space->local_remaining -= cnzi;
301 
302     for (j=0;j<nprow_loc; j++) {
303       ptadenserow_loc[ptasparserow_loc[j]] = 0;
304     }
305     for (j=0;j<nprow_oth; j++) {
306       ptadenserow_oth[ptasparserow_oth[j]] = 0;
307     }
308 
309     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
310     /*        For now, we will recompute what is needed. */
311     ci[i+1] = ci[i] + cnzi;
312   }
313   /* Clean up. */
314   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
315 
316   /* nnz is now stored in ci[pN], column indices are in the list of free space */
317   /* Allocate space for cj, initialize cj, and */
318   /* destroy list of free space and other temporary array(s) */
319   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
320   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
321   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
322   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
323 
324   /* Allocate space for ca */
325   /*
326   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
327   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
328   */
329 
330   /* add C_seq into mpi C              */
331   /*-----------------------------------*/
332   free_space=PETSC_NULL; current_space=PETSC_NULL;
333 
334   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
335   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
336 
337   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
338 
339 
340   /* determine row ownership */
341   /*---------------------------------------------------------*/
342   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
343   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
344   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
345   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
346 
347   /* determine the number of messages to send, their lengths */
348   /*---------------------------------------------------------*/
349   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
350   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
351   len_s  = merge->len_s;
352   len = 0;  /* length of buf_si[] */
353   merge->nsend = 0;
354   for (proc=0; proc<size; proc++){
355     len_si[proc] = 0;
356     if (proc == rank){
357       len_s[proc] = 0;
358     } else {
359       len_si[proc] = owners[proc+1] - owners[proc] + 1;
360       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of rows to be sent to [proc] */
361     }
362     if (len_s[proc]) {
363       merge->nsend++;
364       nrows = 0;
365       for (i=owners[proc]; i<owners[proc+1]; i++){
366         if (ci[i+1] > ci[i]) nrows++;
367       }
368       len_si[proc] = 2*(nrows+1);
369       len += len_si[proc];
370     }
371   }
372 
373   /* determine the number and length of messages to receive for ij-structure */
374   /*-------------------------------------------------------------------------*/
375   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
376   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
377 
378   /* post the Irecv of j-structure */
379   /*-------------------------------*/
380   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
381   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
382 
383   /* post the Isend of j-structure */
384   /*--------------------------------*/
385   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
386   sj_waits = si_waits + merge->nsend;
387 
388   for (proc=0, k=0; proc<size; proc++){
389     if (!len_s[proc]) continue;
390     i = owners[proc];
391     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
392     k++;
393   }
394 
395   /* receives and sends of j-structure are complete */
396   /*------------------------------------------------*/
397   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
398   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
399   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
400 
401   /* send and recv i-structure */
402   /*---------------------------*/
403   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
404   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
405 
406   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
407   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
408   for (proc=0,k=0; proc<size; proc++){
409     if (!len_s[proc]) continue;
410     /* form outgoing message for i-structure:
411          buf_si[0]:                 nrows to be sent
412                [1:nrows]:           row index (local to the receiving process)
413                [nrows+1:2*nrows+1]: i-structure index
414     */
415     /*-------------------------------------------*/
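    /* For example (hypothetical data): if rows with local indices 3 and 7, holding 4 and 2 */
    /* nonzeros, are the only nonempty rows sent to [proc], then buf_si = {2,3,7,0,4,6}.    */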
416     nrows = len_si[proc]/2 - 1;
417     buf_si_i    = buf_si + nrows+1;
418     buf_si[0]   = nrows;
419     buf_si_i[0] = 0;
420     nrows = 0;
421     for (i=owners[proc]; i<owners[proc+1]; i++){
422       anzi = ci[i+1] - ci[i];
423       if (anzi) {
424         buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */
425         buf_si[nrows+1] = i-owners[proc]; /* local row index */
426         nrows++;
427       }
428     }
429     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
430     k++;
431     buf_si += len_si[proc];
432   }
433 
434   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
435   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
436 
437   ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
438   for (i=0; i<merge->nrecv; i++){
439     ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
440   }
441 
442   ierr = PetscFree(len_si);CHKERRQ(ierr);
443   ierr = PetscFree(len_ri);CHKERRQ(ierr);
444   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
445   ierr = PetscFree(si_waits);CHKERRQ(ierr);
446   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
447   ierr = PetscFree(buf_s);CHKERRQ(ierr);
448   ierr = PetscFree(status);CHKERRQ(ierr);
449 
450   /* compute a local seq matrix in each processor */
451   /*----------------------------------------------*/
452   /* allocate bi array and free space for accumulating nonzero column info */
453   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
454   bi[0] = 0;
455 
456   /* create and initialize a linked list */
457   nlnk = pN+1;
458   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
459 
460   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
461   len = 0;
462   len  = ci[owners[rank+1]] - ci[owners[rank]];
463   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
464   current_space = free_space;
465 
466   /* determine symbolic info for each local row */
467   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
468   nextrow = buf_ri_k + merge->nrecv;
469   nextci  = nextrow + merge->nrecv;
470   for (k=0; k<merge->nrecv; k++){
471     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
472     nrows = *buf_ri_k[k];
473     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
474     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th recved i-structure  */
475   }
476 
477   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
478   len = 0;
479   for (i=0; i<pn; i++) {
480     bnzi   = 0;
481     /* add local non-zero cols of this proc's C_seq into lnk */
482     arow   = owners[rank] + i;
483     anzi   = ci[arow+1] - ci[arow];
484     cji    = cj + ci[arow];
485     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
486     bnzi += nlnk;
487     /* add received col data into lnk */
488     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
489       if (i == *nextrow[k]) { /* i-th row */
490         anzi = *(nextci[k]+1) - *nextci[k];
491         cji  = buf_rj[k] + *nextci[k];
492         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
493         bnzi += nlnk;
494         nextrow[k]++; nextci[k]++;
495       }
496     }
497     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
498 
499     /* if free space is not available, make more free space */
500     if (current_space->local_remaining<bnzi) {
501       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
502       nspacedouble++;
503     }
504     /* copy data into free space, then initialize lnk */
505     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
506     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
507 
508     current_space->array           += bnzi;
509     current_space->local_used      += bnzi;
510     current_space->local_remaining -= bnzi;
511 
512     bi[i+1] = bi[i] + bnzi;
513   }
514 
515   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
516 
517   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
518   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
519   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
520 
521   /* create symbolic parallel matrix B_mpi */
522   /*---------------------------------------*/
523   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
524   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
525   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
526   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
527 
528   /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
529   B_mpi->assembled     = PETSC_FALSE;
530   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
531   merge->bi            = bi;
532   merge->bj            = bj;
533   merge->ci            = ci;
534   merge->cj            = cj;
535   merge->buf_ri        = buf_ri;
536   merge->buf_rj        = buf_rj;
537   merge->C_seq         = PETSC_NULL;
538 
539   /* attach the supporting struct to B_mpi for reuse */
540   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
541   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
542   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
543   *C = B_mpi;
544 
545   PetscFunctionReturn(0);
546 
547 #ifdef TOBEREMOVED
548   PetscErrorCode       ierr;
549   Mat                  C_seq;
550   Mat_MatMatMultMPI    *ptap;
551   PetscObjectContainer container;
552 
553   PetscFunctionBegin;
554 
555   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
556   if (container) {
557     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
558   } else {
559     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
560   }
561   /* compute C_seq = P_loc^T * A_loc * P_seq */
562   ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(A,ptap->B_loc,ptap->B_oth,ptap->brstart,fill,&C_seq);CHKERRQ(ierr);
563 
564   /* add C_seq into mpi C */
565   ierr = MatMerge_SeqsToMPISymbolic(A->comm,C_seq,P->n,P->n,C);CHKERRQ(ierr);
566 
567   PetscFunctionReturn(0);
568 #endif /* TOBEREMOVED */
569 }
570 
571 #undef __FUNCT__
572 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
573 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
574 {
575   PetscErrorCode       ierr;
576   Mat_Merge_SeqsToMPI  *merge;
577   Mat_MatMatMultMPI    *ptap;
578   PetscObjectContainer cont_merge,cont_ptap;
579 
580   PetscInt       flops=0;
581   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
582   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
583   Mat_SeqAIJ     *p_loc,*p_oth;
584   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
585   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
586   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
587   PetscInt       *cjj;
588   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj;
589   MatScalar      *pa_loc,*pA,*pa_oth;
590   PetscInt       am=A->m,cN=C->N;
591 
592   MPI_Comm             comm=C->comm;
593   PetscMPIInt          size,rank,taga,*len_s;
594   PetscInt             *owners;
595   PetscInt             proc;
596   PetscInt             **buf_ri,**buf_rj;
597   PetscInt             cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
598   PetscInt             nrows,**buf_ri_k,**nextrow,**nextcseqi;
599   MPI_Request          *s_waits,*r_waits;
600   MPI_Status           *status;
601   MatScalar            **abuf_r,*ba_i;
602   PetscInt             *cseqi,*cseqj;
603   PetscInt             *cseqj_tmp;
604   MatScalar            *cseqa_tmp;
605 
606   PetscFunctionBegin;
607   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
608   if (cont_merge) {
609     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
610   } else {
611     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
612   }
613   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
614   if (cont_ptap) {
615     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
616   } else {
617     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
618   }
619   /* get data from symbolic products */
620   p_loc=(Mat_SeqAIJ*)(ptap->B_loc)->data;
621   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
622   pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
623   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
624 
625   cseqi = merge->ci; cseqj=merge->cj;
626   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
627   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
628 
629   /* Allocate temporary array for storage of one row of A*P */
630   ierr = PetscMalloc(cN*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
631   ierr = PetscMemzero(apa,cN*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
632   apj      = (PetscInt *)(apa + cN);
633   apjdense = apj + cN;
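  /* Workspace layout from the single allocation above: apa[0..cN-1] accumulates the  */
  /* scalar entries of the current row of A*P, apj[] records its nonzero columns in   */
  /* sparse (compressed) form, and apjdense[] flags columns already present in apj.   */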
634 
635   /* Clear old values in C_Seq */
636   ierr = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
637 
638   for (i=0;i<am;i++) {
639     /* Form i-th sparse row of A*P */
640      apnzj = 0;
641     /* diagonal portion of A */
642     anzi  = adi[i+1] - adi[i];
643     for (j=0;j<anzi;j++) {
644       prow = *adj;
645       adj++;
646       pnzj = pi_loc[prow+1] - pi_loc[prow];
647       pjj  = pj_loc + pi_loc[prow];
648       paj  = pa_loc + pi_loc[prow];
649       for (k=0;k<pnzj;k++) {
650         if (!apjdense[pjj[k]]) {
651           apjdense[pjj[k]] = -1;
652           apj[apnzj++]     = pjj[k];
653         }
654         apa[pjj[k]] += (*ada)*paj[k];
655       }
656       flops += 2*pnzj;
657       ada++;
658     }
659     /* off-diagonal portion of A */
660     anzi  = aoi[i+1] - aoi[i];
661     for (j=0;j<anzi;j++) {
662       col = a->garray[*aoj];
663       if (col < cstart){
664         prow = *aoj;
665       } else if (col >= cend){
666         prow = *aoj;
667       } else {
668         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
669       }
670       aoj++;
671       pnzj = pi_oth[prow+1] - pi_oth[prow];
672       pjj  = pj_oth + pi_oth[prow];
673       paj  = pa_oth + pi_oth[prow];
674       for (k=0;k<pnzj;k++) {
675         if (!apjdense[pjj[k]]) {
676           apjdense[pjj[k]] = -1;
677           apj[apnzj++]     = pjj[k];
678         }
679         apa[pjj[k]] += (*aoa)*paj[k];
680       }
681       flops += 2*pnzj;
682       aoa++;
683     }
684     /* Sort the j index array for quick sparse axpy. */
685     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
686 
687     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
688     pnzi = pi_loc[i+1] - pi_loc[i];
689     for (j=0;j<pnzi;j++) {
690       nextap = 0;
691       crow   = *pJ++;
692       cjj    = cseqj + cseqi[crow];
693       caj    = cseqa + cseqi[crow];
694       /* Perform sparse axpy operation.  Note cjj includes apj. */
695       for (k=0;nextap<apnzj;k++) {
696         if (cjj[k]==apj[nextap]) {
697           caj[k] += (*pA)*apa[apj[nextap++]];
698         }
699       }
700       flops += 2*apnzj;
701       pA++;
702     }
703 
704     /* Zero the current row info for A*P */
705     for (j=0;j<apnzj;j++) {
706       apa[apj[j]]      = 0.;
707       apjdense[apj[j]] = 0;
708     }
709   }
710 
711   ierr = PetscFree(apa);CHKERRQ(ierr);
712   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
713 
714   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
715   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
716 
717   bi     = merge->bi;
718   bj     = merge->bj;
719   buf_ri = merge->buf_ri;
720   buf_rj = merge->buf_rj;
721 
722   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
723   ierr   = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
724 
725   /* send and recv matrix values */
726   /*-----------------------------*/
727   len_s  = merge->len_s;
728   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
729   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
730 
731   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
732   for (proc=0,k=0; proc<size; proc++){
733     if (!len_s[proc]) continue;
734     i = owners[proc];
735     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
736     k++;
737   }
738 
739   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
740   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
741   ierr = PetscFree(status);CHKERRQ(ierr);
742 
743   ierr = PetscFree(s_waits);CHKERRQ(ierr);
744   ierr = PetscFree(r_waits);CHKERRQ(ierr);
745 
746   /* insert mat values of mpimat */
747   /*----------------------------*/
748   ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr);
749   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
750   nextrow = buf_ri_k + merge->nrecv;
751   nextcseqi  = nextrow + merge->nrecv;
752 
753   for (k=0; k<merge->nrecv; k++){
754     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
755     nrows = *(buf_ri_k[k]);
756     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
757     nextcseqi[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th recved i-structure  */
758   }
759 
760   /* set values of ba */
761   for (i=0; i<C->m; i++) {
762     cseqrow = owners[rank] + i;
763     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
764     bnzi = bi[i+1] - bi[i];
765     ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
766 
767     /* add local non-zero vals of this proc's C_seq into ba */
768     cseqnzi = cseqi[cseqrow+1] - cseqi[cseqrow];
769     cseqj_tmp = cseqj + cseqi[cseqrow];
770     cseqa_tmp = cseqa + cseqi[cseqrow];
771     nextcseqj = 0;
772     for (j=0; nextcseqj<cseqnzi; j++){
773       if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
774         ba_i[j] += cseqa_tmp[nextcseqj++];
775       }
776     }
777 
778     /* add received vals into ba */
779     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
780       /* i-th row */
781       if (i == *nextrow[k]) {
782         cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k];
783         cseqj_tmp   = buf_rj[k] + *(nextcseqi[k]);
784         cseqa_tmp   = abuf_r[k] + *(nextcseqi[k]);
785         nextcseqj = 0;
786         for (j=0; nextcseqj<cseqnzi; j++){
787           if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
788             ba_i[j] += cseqa_tmp[nextcseqj++];
789           }
790         }
791         nextrow[k]++; nextcseqi[k]++;
792       }
793     }
794     ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
795   }
796   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
797   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
798 
799   ierr = PetscFree(cseqa);CHKERRQ(ierr);
800   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
801   ierr = PetscFree(ba_i);CHKERRQ(ierr);
802   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
803 
804   PetscFunctionReturn(0);
805 }
806 
807 #ifdef TOBEREMOVED
808 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
809 #undef __FUNCT__
810 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
811 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
812 {
813   PetscErrorCode       ierr;
814   Mat_MatMatMultMPI    *ptap;
815   PetscObjectContainer container;
816 
817   PetscFunctionBegin;
818   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
819   if (container) {
820     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
821     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
822     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
823     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
824     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
825 
826     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
827     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
828     ierr = PetscFree(ptap);CHKERRQ(ierr);
829   }
830 
831   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
832   PetscFunctionReturn(0);
833 }
834 
835 #undef __FUNCT__
836 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
837 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
838 {
839   PetscErrorCode       ierr;
840   Mat_MatMatMultMPI    *ptap;
841   PetscObjectContainer container;
842 
843   PetscFunctionBegin;
844   if (scall == MAT_INITIAL_MATRIX){
845     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
846     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
847 
848     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
849     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
850 
851     /* get P_loc by taking all local rows of P */
852     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
853 
854     /* attach the supporting struct to P for reuse */
855     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
856     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
857     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
858     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
859 
860     /* now, compute symbolic local P^T*A*P */
861     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
862     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
863   } else if (scall == MAT_REUSE_MATRIX){
864     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
865     if (container) {
866       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
867     } else {
868       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
869     }
870     /* update P_oth */
871     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
872 
873   } else {
874     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
875   }
876   /* now, compute numeric local P^T*A*P */
877   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
878   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
879   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
880 
881   PetscFunctionReturn(0);
882 }
883 /* Set symbolic info for i-th row of local product C=P^T*A*P */
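/* Note: the macro below begins with "0;" so that it can be invoked as                  */
/* "ierr = MatPtAPSymbolicLocal_Private(...);CHKERRQ(ierr);": the "ierr =" consumes     */
/* the leading 0 and the brace-enclosed block then executes as an ordinary statement.   */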
884 #define MatPtAPSymbolicLocal_Private(ptM,pti,ptj,ci,cj) 0;\
885 {\
886   /* allocate ci array, arrays for fill computation and */\
887   /* free space for accumulating nonzero column info */\
888   ierr = PetscMalloc((ptM+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);\
889   ci[0] = 0;\
890 \
891   /* set initial free space to be nnz(A) scaled by fill*P->N/PtM. */\
892   /* this should be reasonable if sparsity of PtAP is similar to that of A. */\
893   nnz           = adi[am] + aoi[am];\
894   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/ptM+1),&free_space);CHKERRQ(ierr);\
895   current_space = free_space;\
896 \
897   ptJ=ptj;\
898   for (i=0; i<ptM; i++) {\
899     ptnzi  = pti[i+1] - pti[i];\
900     nprow_loc = 0; nprow_oth = 0;\
901     /* i-th row of symbolic Pt*A: */\
902     for (j=0;j<ptnzi;j++) {\
903       arow = *ptJ++;\
904       /* diagonal portion of A */\
905       anzj = adi[arow+1] - adi[arow];\
906       ajj  = adj + adi[arow];\
907       for (k=0;k<anzj;k++) {\
908         col = ajj[k]+prstart;\
909         if (!ptadenserow_loc[col]) {\
910           ptadenserow_loc[col]    = -1;\
911           ptasparserow_loc[nprow_loc++] = col;\
912         }\
913       }\
914       /* off-diagonal portion of A */\
915       anzj = aoi[arow+1] - aoi[arow];\
916       ajj  = aoj + aoi[arow];\
917       for (k=0;k<anzj;k++) {\
918         col = a->garray[ajj[k]];  /* global col */\
919         if (col < cstart){\
920           col = ajj[k];\
921         } else if (col >= cend){\
922           col = ajj[k] + pm;\
923         } else {\
924           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");\
925         }\
926         if (!ptadenserow_oth[col]) {\
927           ptadenserow_oth[col]    = -1;\
928           ptasparserow_oth[nprow_oth++] = col;\
929         }\
930       }\
931     }\
932     /* determine symbolic info for row of C_seq: */\
933     cnzi   = 0;\
934     /* rows of P_loc */\
935     ptaj = ptasparserow_loc;\
936     for (j=0; j<nprow_loc; j++) {\
937       prow = *ptaj++; \
938       prow -= prstart; /* rm */\
939       pnzj = pi_loc[prow+1] - pi_loc[prow];\
940       pjj  = pj_loc + pi_loc[prow];\
941       /* add non-zero cols of P into the sorted linked list lnk */\
942       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
943       cnzi += nlnk;\
944     }\
945     /* rows of P_oth */\
946     ptaj = ptasparserow_oth;\
947     for (j=0; j<nprow_oth; j++) {\
948       prow = *ptaj++; \
949       if (prow >= prend) prow -= pm; /* rm */\
950       pnzj = pi_oth[prow+1] - pi_oth[prow];\
951       pjj  = pj_oth + pi_oth[prow];\
952       /* add non-zero cols of P into the sorted linked list lnk */\
953       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
954       cnzi += nlnk;\
955     }\
956    \
957     /* If free space is not available, make more free space */\
958     /* Double the amount of total space in the list */\
959     if (current_space->local_remaining<cnzi) {\
960       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);\
961     }\
962 \
963     /* Copy data into free space, and zero out denserows */\
964     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);\
965     current_space->array           += cnzi;\
966     current_space->local_used      += cnzi;\
967     current_space->local_remaining -= cnzi;\
968    \
969     for (j=0;j<nprow_loc; j++) {\
970       ptadenserow_loc[ptasparserow_loc[j]] = 0;\
971     }\
972     for (j=0;j<nprow_oth; j++) {\
973       ptadenserow_oth[ptasparserow_oth[j]] = 0;\
974     }\
975     \
976     /* Aside: Perhaps we should save the pta info for the numerical factorization. */\
977     /*        For now, we will recompute what is needed. */ \
978     ci[i+1] = ci[i] + cnzi;\
979   }\
980   /* nnz is now stored in ci[ptM], column indices are in the list of free space */\
981   /* Allocate space for cj, initialize cj, and */\
982   /* destroy list of free space and other temporary array(s) */\
983   ierr = PetscMalloc((ci[ptM]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);\
984   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);\
985 }
986 #undef __FUNCT__
987 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
988 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
989 {
990   PetscErrorCode       ierr;
991   Mat                  P_loc,P_oth;
992   Mat_MatMatMultMPI    *ptap;
993   PetscObjectContainer container;
994   FreeSpaceList        free_space=PETSC_NULL,current_space;
995   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
996   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,
997                        *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
998   Mat_SeqAIJ           *p_loc,*p_oth;
999   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj,*rmap=p->garray;
1000   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
1001   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
1002   PetscInt             aN=A->N,am=A->m,pN=P->N;
1003   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi,row;
1004   PetscBT              lnkbt;
1005   PetscInt             prstart,prend,nprow_loc,nprow_oth;
1006   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
1007 
1008   MPI_Comm             comm=A->comm;
1009   Mat                  B_mpi;
1010   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
1011   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
1012   PetscInt             len,proc,*dnz,*onz,*owners;
1013   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
1014   PetscInt             nrows,tnrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1015   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
1016   MPI_Status           *status;
1017   Mat_Merge_SeqsToMPI  *merge;
1018   PetscInt             *ptdi,*ptdj,*cdi,*cdj;
1019   PetscInt             *ptoi,*ptoj,*coi,*coj;
1020 
1021   PetscFunctionBegin;
1022   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
1023   if (container) {
1024     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
1025   } else {
1026     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
1027   }
1028 
1029   /* determine row ownership of C */
1030   /*---------------------------------------------------------*/
1031   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1032   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1033   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
1034 
1035   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1036   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
1037   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
1038   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
1039 
1040   /* get data from P_loc and P_oth */
1041   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
1042   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
1043   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
1044   prend = prstart + pm;
1045 
1046   /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
1047   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
1048   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
1049   ptasparserow_loc = ptadenserow_loc + aN;
1050   ptadenserow_oth  = ptasparserow_loc + aN;
1051   ptasparserow_oth = ptadenserow_oth + aN;
1052 
1053   /* create and initialize a linked list */
1054   nlnk = pN+1;
1055   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1056 
1057   /* Pt = P_loc^T */
1058   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1059   ierr = MatPtAPSymbolicLocal_Private(pN,pti,ptj,ci,cj);CHKERRQ(ierr);
1060 
1061   /* Pt = Pd^T - diagonal portion of P_loc */
1062   ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);
1063   ierr = MatPtAPSymbolicLocal_Private(pn,ptdi,ptdj,cdi,cdj);CHKERRQ(ierr);
1064   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);
1065 #ifdef NEW
1066   /* Pt = Po^T - off-diagonal portion of P_loc */
1067   ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&ptoi,&ptoj);CHKERRQ(ierr);
1068   ierr = MatPtAPSymbolicLocal_Private(tnrows,ptoi,ptoj,coi,coj);CHKERRQ(ierr);
1069   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&ptoi,&ptoj);CHKERRQ(ierr);
1070 #endif
1071   /* clean up */
1072   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
1073   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1074   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1075 
1076   /* determine the number of messages to send, their lengths */
1077   /*---------------------------------------------------------*/
1078   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
1079   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1080   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
1081   len_s  = merge->len_s;
1082   len = 0;  /* max length of buf_si[] */
1083   merge->nsend = 0;
1084   tnrows = (p->B)->N; /* total num of rows to be sent to other processors */
1085   proc = 0;
1086   for (i=0; i<tnrows; i++){
1087     while (rmap[i] >= owners[proc+1]) proc++;
1088     len_si[proc]++;
1089   }
1090   for (proc=0; proc<size; proc++){
1091     len_s[proc] = 0;
1092     if (len_si[proc]){
1093       merge->nsend++;
1094       len_si[proc] = 2*(len_si[proc] + 1);
1095       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of col indices to be sent to [proc] */
1096       len += len_si[proc];
1097     }
1098   }
1099 
1100   /* determine the number and length of messages to receive for ij-structure */
1101   /*-------------------------------------------------------------------------*/
1102   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1103   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1104 
1105   /* post the Irecv of j-structure */
1106   /*-------------------------------*/
1107   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
1108   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
1109 
1110   /* post the Isend of j-structure */
1111   /*--------------------------------*/
1112   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
1113   sj_waits = si_waits + merge->nsend;
1114 
1115   for (proc=0, k=0; proc<size; proc++){
1116     if (!len_s[proc]) continue;
1117     i = owners[proc];
1118     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
1119     k++;
1120   }
1121 
1122   /* receives and sends of j-structure are complete */
1123   /*------------------------------------------------*/
1124   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
1125   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
1126   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
1127 
1128   /* send and recv i-structure */
1129   /*---------------------------*/
1130   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
1131   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
1132 
1133   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
1134   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1135 
1136   j = 0; /* row index to be sent */
1137   for (proc=0,k=0; proc<size; proc++){
1138     if (!len_s[proc]) continue;
1139     /* form outgoing message for i-structure:
1140          buf_si[0]:                 nrows to be sent
1141               [1:nrows]:           row index (local to the receiving process)
1142                [nrows+1:2*nrows+1]: i-structure index
1143     */
1144     /*-------------------------------------------*/
1145     nrows = len_si[proc]/2 - 1;
1146     buf_si_i    = buf_si + nrows+1;
1147     buf_si[0]   = nrows;
1148     buf_si_i[0] = 0;
1149     for (i=0; i<nrows; i++){
1150       row = rmap[j++];
1151       anzi = ci[row+1] - ci[row];
1152       buf_si_i[i+1] = buf_si_i[i] + anzi; /* i-structure */
1153       buf_si[i+1]   = row - owners[proc]; /* local row index */
1154     }
1155     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
1156     k++;
1157     buf_si += len_si[proc];
1158   }
1159 
1160   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
1161   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
1162 
1163   ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
1164   for (i=0; i<merge->nrecv; i++){
1165     ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
1166   }
1167 
1168   ierr = PetscFree(len_si);CHKERRQ(ierr);
1169   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1170   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
1171   ierr = PetscFree(si_waits);CHKERRQ(ierr);
1172   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
1173   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1174   ierr = PetscFree(status);CHKERRQ(ierr);
1175 
1176   /* compute a local seq matrix in each processor */
1177   /*----------------------------------------------*/
1178   /* allocate bi array and free space for accumulating nonzero column info */
1179   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
1180   bi[0] = 0;
1181 
1182   /* create and initialize a linked list */
1183   nlnk = pN+1;
1184   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1185 
1186   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
1187   len = 0;
1188   len  = ci[owners[rank+1]] - ci[owners[rank]];
1189   free_space=PETSC_NULL;
1190   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
1191   current_space = free_space;
1192 
1193   /* determine symbolic info for each local row of C */
1194   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
1195   nextrow = buf_ri_k + merge->nrecv;
1196   nextci  = nextrow + merge->nrecv;
1197   for (k=0; k<merge->nrecv; k++){
1198     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1199     nrows = *buf_ri_k[k];
1200     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
1201     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th recved i-structure  */
1202   }
1203 
1204   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
1205   len = 0;
1206   for (i=0; i<pn; i++) {
1207     bnzi = 0;
1208     /* add local non-zero cols of this proc's C_seq into lnk */
1209     anzi = cdi[i+1] - cdi[i];
1210     cji  = cdj + cdi[i];
1211     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1212     bnzi += nlnk;
1213     /* add received col data into lnk */
1214     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1215       if (i == *nextrow[k]) { /* i-th row */
1216         anzi = *(nextci[k]+1) - *nextci[k];
1217         cji  = buf_rj[k] + *nextci[k];
1218         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1219         bnzi += nlnk;
1220         nextrow[k]++; nextci[k]++;
1221       }
1222     }
1223     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
1224 
1225     /* if free space is not available, make more free space */
1226     if (current_space->local_remaining<bnzi) {
1227       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1228       nspacedouble++;
1229     }
1230     /* copy data into free space, then initialize lnk */
1231     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1232     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
1233 
1234     current_space->array           += bnzi;
1235     current_space->local_used      += bnzi;
1236     current_space->local_remaining -= bnzi;
1237 
1238     bi[i+1] = bi[i] + bnzi;
1239   }
1240 
1241   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
1242 
1243   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
1244   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1245   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1246 
1247   /* create symbolic parallel matrix B_mpi */
1248   /*---------------------------------------*/
1249   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
1250   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
1251   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
1252   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1253 
1254   /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
1255   B_mpi->assembled     = PETSC_FALSE;
1256   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
1257   merge->bi            = bi;
1258   merge->bj            = bj;
1259   merge->ci            = ci;
1260   merge->cj            = cj;
1261   merge->buf_ri        = buf_ri;
1262   merge->buf_rj        = buf_rj;
1263   merge->C_seq         = PETSC_NULL;
1264 
1265   /* attach the supporting struct to B_mpi for reuse */
1266   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
1267   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
1268   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
1269   *C = B_mpi;
1270 
1271   ierr = PetscFree(cdi);CHKERRQ(ierr);
1272   ierr = PetscFree(cdj);CHKERRQ(ierr);
1273 #ifdef NEW
1274   ierr = PetscFree(coi);CHKERRQ(ierr);
1275   ierr = PetscFree(coj);CHKERRQ(ierr);
1276 #endif
1277   PetscFunctionReturn(0);
1278 }
1279 
1280 #undef __FUNCT__
1281 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
1282 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1283 {
1284   PetscErrorCode       ierr;
1285   Mat_Merge_SeqsToMPI  *merge;
1286   Mat_MatMatMultMPI    *ptap;
1287   PetscObjectContainer cont_merge,cont_ptap;
1288   PetscInt             flops=0;
1289   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1290   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1291   Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data,*p_oth;
1292   Mat_SeqAIJ           *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
1293   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense;
1294   PetscInt             *pi_oth,*pj_oth,*pJ_d=pd->j,*pJ_o=po->j,*pjj;
1295   PetscInt             i,j,k,nzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1296   MatScalar            *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj,**abuf_r,*ba_i,*ca;
1297   MatScalar            *pA_d=pd->a,*pA_o=po->a,*pa_oth;
1298   PetscInt             am=A->m,cN=C->N,cm=C->m;
1299   MPI_Comm             comm=C->comm;
1300   PetscMPIInt          size,rank,taga,*len_s,**buf_ri,**buf_rj,**buf_ri_k,**nextrow,**nextcseqi;
1301   PetscInt             *owners,proc,nrows;
1302   PetscInt             *cjj,cseqnzi,*bj_i,*bi,*bj,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
1303   MPI_Request          *s_waits,*r_waits;
1304   MPI_Status           *status;
1305   PetscInt             *cseqi,*cseqj,col;
1306   PetscInt             stages[3];
1307 
1308   PetscFunctionBegin;
1309   ierr = PetscLogStageRegister(&stages[0],"NumAP_local");CHKERRQ(ierr);
1310   ierr = PetscLogStageRegister(&stages[1],"NumPtAP_local");CHKERRQ(ierr);
1311   ierr = PetscLogStageRegister(&stages[2],"NumPtAP_Comm");CHKERRQ(ierr);
1312 
1313   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
1314   if (cont_merge) {
1315     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
1316   } else {
1317     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
1318   }
1319   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
1320   if (cont_ptap) {
1321     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
1322   } else {
1323     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
1324   }
1325   /* get data from symbolic products */
1326   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
1327   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
1328 
1329   cseqi = merge->ci; cseqj=merge->cj;
1330   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
1331   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
1332 
1333   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1334   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1335   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
1336 
1337   /* Allocate temporary array for storage of one row of A*P and one row of C */
1338   ierr = PetscMalloc(2*cN*(sizeof(MatScalar)+sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1339   ierr = PetscMemzero(apa,2*cN*(sizeof(MatScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
1340   ca       = (MatScalar*)(apa + cN);
1341   apj      = (PetscInt *)(ca + cN);
1342   apjdense = (PetscInt *)(apj + cN);
1343 
1344   /* Clear old values in C */
1345   ierr = PetscMemzero(cd->a,cd->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1346   ierr = PetscMemzero(co->a,co->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1347 
1348   for (i=0;i<am;i++) {
1349     ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
1350     /* Form i-th sparse row of A*P */
1351     apnzj = 0;
1352     /* diagonal portion of A */
1353     nzi  = adi[i+1] - adi[i];
1354     for (j=0;j<nzi;j++) {
1355       prow = *adj++;
1356       /* diagonal portion of P */
1357       pnzj = pd->i[prow+1] - pd->i[prow];
1358       pjj  = pd->j + pd->i[prow]; /* local col index of P */
1359       paj  = pd->a + pd->i[prow];
1360       for (k=0;k<pnzj;k++) {
1361         col = *pjj + p->cstart; pjj++; /* global col index of P */
1362         if (!apjdense[col]) {
1363           apjdense[col] = -1;
1364           apj[apnzj++]  = col;
1365         }
1366         apa[col] += (*ada)*paj[k];
1367       }
1368       flops += 2*pnzj;
1369       /* off-diagonal portion of P */
1370       pnzj = po->i[prow+1] - po->i[prow];
1371       pjj  = po->j + po->i[prow]; /* local col index of P */
1372       paj  = po->a + po->i[prow];
1373       for (k=0;k<pnzj;k++) {
1374         col = p->garray[*pjj]; pjj++; /* global col index of P */
1375         if (!apjdense[col]) {
1376           apjdense[col] = -1;
1377           apj[apnzj++]  = col;
1378         }
1379         apa[col] += (*ada)*paj[k];
1380       }
1381       flops += 2*pnzj;
1382 
1383       ada++;
1384     }
1385     /* off-diagonal portion of A */
1386     nzi  = aoi[i+1] - aoi[i];
1387     for (j=0;j<nzi;j++) {
1388       prow = *aoj++;
1389       pnzj = pi_oth[prow+1] - pi_oth[prow];
1390       pjj  = pj_oth + pi_oth[prow];
1391       paj  = pa_oth + pi_oth[prow];
1392       for (k=0;k<pnzj;k++) {
1393         if (!apjdense[pjj[k]]) {
1394           apjdense[pjj[k]] = -1;
1395           apj[apnzj++]     = pjj[k];
1396         }
1397         apa[pjj[k]] += (*aoa)*paj[k];
1398       }
1399       flops += 2*pnzj;
1400       aoa++;
1401     }
1402     /* Sort the j index array for quick sparse axpy. */
1403     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1404 
1405     ierr = PetscLogStagePop();CHKERRQ(ierr);
1406     ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
1407     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
1408     /* diagonal portion of P -- gives matrix value of local C */
1409     pnzi = pd->i[i+1] - pd->i[i];
1410     for (j=0;j<pnzi;j++) {
1411       /* add value into C */
1412       for (k=0; k<apnzj; k++) ca[k] = (*pA_d)*apa[apj[k]];
1413       crow = (*pJ_d++) + owners[rank];
1414       ierr = MatSetValues(C,1,&crow,apnzj,apj,ca,ADD_VALUES);CHKERRQ(ierr);
1415       ierr = PetscMemzero(ca,apnzj*(sizeof(MatScalar)));CHKERRQ(ierr);
1416       pA_d++;
1417       flops += 2*apnzj;
1418     }
1419 
1420     /* off-diagonal portion of P -- gives matrix values to be sent to others */
1421     pnzi = po->i[i+1] - po->i[i];
1422     for (j=0;j<pnzi;j++) {
1423       crow   = p->garray[*pJ_o++];
1424       cjj    = cseqj + cseqi[crow];
1425       caj    = cseqa + cseqi[crow];
1426       /* add value into C_seq to be sent to other processors */
1427       nextap = 0;
1428       for (k=0;nextap<apnzj;k++) {
1429         if (cjj[k]==apj[nextap]) {
1430           caj[k] += (*pA_o)*apa[apj[nextap++]];
1431         }
1432       }
1433       flops += 2*apnzj;
1434       pA_o++;
1435     }
1436 
1437     /* Zero the current row info for A*P */
1438     for (j=0;j<apnzj;j++) {
1439       apa[apj[j]]      = 0.;
1440       apjdense[apj[j]] = 0;
1441     }
1442     ierr = PetscLogStagePop();CHKERRQ(ierr);
1443   }
1444 
1445   /* send and recv matrix values */
1446   /*-----------------------------*/
1447   ierr = PetscLogStagePush(stages[2]);CHKERRQ(ierr);
1448   bi     = merge->bi;
1449   bj     = merge->bj;
1450   buf_ri = merge->buf_ri;
1451   buf_rj = merge->buf_rj;
1452   len_s  = merge->len_s;
1453   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
1454   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1455 
1456   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
1457   for (proc=0,k=0; proc<size; proc++){
1458     if (!len_s[proc]) continue;
1459     i = owners[proc];
1460     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1461     k++;
1462   }
1463   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
1464   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
1465   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
1466   ierr = PetscFree(status);CHKERRQ(ierr);
1467 
1468   ierr = PetscFree(s_waits);CHKERRQ(ierr);
1469   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1470   ierr = PetscFree(cseqa);CHKERRQ(ierr);
1471 
1472   /* insert mat values of mpimat */
1473   /*----------------------------*/
1474   ba_i = apa; /* rename, temporary array for storage of one row of C */
1475   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
1476   nextrow = buf_ri_k + merge->nrecv;
1477   nextcseqi = nextrow + merge->nrecv;
1478 
1479   for (k=0; k<merge->nrecv; k++){
1480     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1481     nrows        = *(buf_ri_k[k]);
1482     nextrow[k]   = buf_ri_k[k]+1;  /* next row index of k-th received i-structure */
1483     nextcseqi[k] = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of the k-th received i-structure */
1484   }
1485 
1486   /* add received local vals to C */
1487   for (i=0; i<cm; i++) {
1488     crow = owners[rank] + i;
1489     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
1490     bnzi = bi[i+1] - bi[i];
1491     nzi = 0;
1492     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1493       /* i-th row */
1494       if (i == *nextrow[k]) {
1495         cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k];
1496         cseqj   = buf_rj[k] + *(nextcseqi[k]);
1497         cseqa   = abuf_r[k] + *(nextcseqi[k]);
1498         nextcseqj = 0;
1499         for (j=0; nextcseqj<cseqnzi; j++){
1500           if (*(bj_i + j) == cseqj[nextcseqj]){ /* bcol == cseqcol */
1501             ba_i[j] += cseqa[nextcseqj++];
1502             nzi++;
1503           }
1504         }
1505         nextrow[k]++; nextcseqi[k]++;
1506       }
1507     }
1508     if (nzi>0){
1509       ierr = MatSetValues(C,1,&crow,bnzi,bj_i,ba_i,ADD_VALUES);CHKERRQ(ierr);
1510       ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
1511       flops += 2*nzi;
1512     }
1513   }
1514   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1515   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1516   ierr = PetscLogStagePop();CHKERRQ(ierr);
1517 
1518   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1519   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
1520   ierr = PetscFree(apa);CHKERRQ(ierr);
1521   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1522 
1523   PetscFunctionReturn(0);
1524 }
1525 #endif /* TOBEREMOVED */
1526 
1527 #undef __FUNCT__
1528 #define __FUNCT__ "MatPtAP_SeqAIJ_SeqAIJ"
1529 PetscErrorCode MatPtAP_SeqAIJ_SeqAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
1530 {
1531   PetscErrorCode ierr;
1532 
1533   PetscFunctionBegin;
1534   if (scall == MAT_INITIAL_MATRIX){
1535     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1536     ierr = MatPtAPSymbolic_SeqAIJ_SeqAIJ(A,P,fill,C);CHKERRQ(ierr);
1537     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1538   }
1539   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1540   ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,*C);CHKERRQ(ierr);
1541   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1542   PetscFunctionReturn(0);
1543 }
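
/*
   A minimal usage sketch (kept out of the build with #if 0) of the MatReuse
   logic above: the first MatPtAP() call performs both the symbolic and numeric
   phases, later calls with MAT_REUSE_MATRIX redo only the numeric phase.  The
   driver function, the fill value 1.0 and UpdateOperatorEntries() are
   hypothetical; only the MatPtAP() calling sequence mirrors this file.
*/
#if 0
PetscErrorCode GalerkinProjectLoop(Mat A,Mat P,PetscInt nsteps)
{
  PetscErrorCode ierr;
  PetscInt       step;
  Mat            C;

  PetscFunctionBegin;
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
  for (step=1; step<nsteps; step++) {
    /* hypothetical helper: new values in A, same nonzero pattern */
    ierr = UpdateOperatorEntries(A,step);CHKERRQ(ierr);
    ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);
  }
  ierr = MatDestroy(C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif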
1544 
1545 /*
1546    MatPtAPSymbolic - Creates the (i,j) structure of the matrix projection C = P^T * A * P
1547 
1548    Collective on Mat
1549 
1550    Input Parameters:
1551 +  A - the matrix
1552 -  P - the projection matrix
1553 
1554    Output Parameters:
1555 .  C - the (i,j) structure of the product matrix
1556 
1557    Notes:
1558    C will be created and must be destroyed by the user with MatDestroy().
1559 
1560    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
1561    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.  The product is computed using
1562    this (i,j) structure by calling MatPtAPNumeric().
1563 
1564    Level: intermediate
1565 
1566 .seealso: MatPtAP(),MatPtAPNumeric(),MatMatMultSymbolic()
1567 */
1568 #undef __FUNCT__
1569 #define __FUNCT__ "MatPtAPSymbolic"
1570 PetscErrorCode MatPtAPSymbolic(Mat A,Mat P,PetscReal fill,Mat *C)
1571 {
1572   PetscErrorCode ierr;
1573   PetscErrorCode (*fA)(Mat,Mat,PetscReal,Mat*);
1574   PetscErrorCode (*fP)(Mat,Mat,PetscReal,Mat*);
1575 
1576   PetscFunctionBegin;
1577 
1578   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
1579   PetscValidType(A,1);
1580   MatPreallocated(A);
1581   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1582   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1583 
1584   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
1585   PetscValidType(P,2);
1586   MatPreallocated(P);
1587   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1588   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1589 
1590   PetscValidPointer(C,3);
1591 
1592   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
1593   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
1594 
1595   /* For now, we do not dispatch based on the type of A and P */
1596   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
1597   fA = A->ops->ptapsymbolic;
1598   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for A of type %s",A->type_name);
1599   fP = P->ops->ptapsymbolic;
1600   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for P of type %s",P->type_name);
1601   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPSymbolic requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
1602 
1603   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1604   ierr = (*fA)(A,P,fill,C);CHKERRQ(ierr);
1605   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1606 
1607   PetscFunctionReturn(0);
1608 }
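
/*
   A minimal sketch (kept out of the build with #if 0) of driving the two
   phases directly, as described in the note above: MatPtAPSymbolic() builds
   the (i,j) structure once, MatPtAPNumeric() fills (and can refill) the
   values.  The wrapper function and the fill estimate 2.0 are assumptions.
*/
#if 0
PetscErrorCode ProjectWithExplicitPhases(Mat A,Mat P)
{
  PetscErrorCode ierr;
  Mat            C;

  PetscFunctionBegin;
  ierr = MatPtAPSymbolic(A,P,2.0,&C);CHKERRQ(ierr); /* (i,j) pattern of C only */
  ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);       /* fill in the numerical values */
  /* ... change the entries of A without changing its nonzero pattern ... */
  ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);       /* recompute values on the same pattern */
  ierr = MatDestroy(C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif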
1609 
1610 typedef struct {
1611   Mat    symAP;
1612 } Mat_PtAPstruct;
1613 
1614 EXTERN PetscErrorCode MatDestroy_SeqAIJ(Mat);
1615 
1616 #undef __FUNCT__
1617 #define __FUNCT__ "MatDestroy_SeqAIJ_PtAP"
1618 PetscErrorCode MatDestroy_SeqAIJ_PtAP(Mat A)
1619 {
1620   PetscErrorCode    ierr;
1621   Mat_PtAPstruct    *ptap=(Mat_PtAPstruct*)A->spptr;
1622 
1623   PetscFunctionBegin;
1624   ierr = MatDestroy(ptap->symAP);CHKERRQ(ierr);
1625   ierr = PetscFree(ptap);CHKERRQ(ierr);
1626   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
1627   PetscFunctionReturn(0);
1628 }
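
/*
   A minimal sketch (kept out of the build with #if 0) of how the destroy
   override above is meant to be used: the symbolic A*P product is cached in a
   Mat_PtAPstruct hung off C->spptr, and C->ops->destroy is redirected so the
   cache is freed before the usual SeqAIJ destruction.  InstallPtAPCache() and
   its call site are assumptions; only the spptr/ops->destroy chaining mirrors
   this file.
*/
#if 0
PetscErrorCode InstallPtAPCache(Mat C,Mat symAP)
{
  PetscErrorCode ierr;
  Mat_PtAPstruct *ptap;

  PetscFunctionBegin;
  ierr = PetscMalloc(sizeof(Mat_PtAPstruct),&ptap);CHKERRQ(ierr);
  ptap->symAP     = symAP;                  /* cached symbolic A*P product         */
  C->spptr        = (void*)ptap;            /* stash the cache on the matrix       */
  C->ops->destroy = MatDestroy_SeqAIJ_PtAP; /* free the cache, then the base class */
  PetscFunctionReturn(0);
}
#endif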
1629 
1630 #undef __FUNCT__
1631 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqAIJ"
1632 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
1633 {
1634   PetscErrorCode ierr;
1635   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1636   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*p = (Mat_SeqAIJ*)P->data,*c;
1637   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
1638   PetscInt       *ci,*cj,*ptadenserow,*ptasparserow,*ptaj;
1639   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M;
1640   PetscInt       i,j,k,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,nlnk,*lnk;
1641   MatScalar      *ca;
1642   PetscBT        lnkbt;
1643 
1644   PetscFunctionBegin;
1645   /* Get ij structure of P^T */
1646   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1647   ptJ=ptj;
1648 
1649   /* Allocate ci array, arrays for fill computation and */
1650   /* free space for accumulating nonzero column info */
1651   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1652   ci[0] = 0;
1653 
1654   ierr = PetscMalloc((2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1655   ierr = PetscMemzero(ptadenserow,(2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1656   ptasparserow = ptadenserow  + an;
1657 
1658   /* create and initialize a linked list */
1659   nlnk = pn+1;
1660   ierr = PetscLLCreate(pn,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1661 
1662   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1663   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1664   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
1665   current_space = free_space;
1666 
1667   /* Determine symbolic info for each row of C: */
1668   for (i=0;i<pn;i++) {
1669     ptnzi  = pti[i+1] - pti[i];
1670     ptanzi = 0;
1671     /* Determine symbolic row of PtA: */
1672     for (j=0;j<ptnzi;j++) {
1673       arow = *ptJ++;
1674       anzj = ai[arow+1] - ai[arow];
1675       ajj  = aj + ai[arow];
1676       for (k=0;k<anzj;k++) {
1677         if (!ptadenserow[ajj[k]]) {
1678           ptadenserow[ajj[k]]    = -1;
1679           ptasparserow[ptanzi++] = ajj[k];
1680         }
1681       }
1682     }
1683     /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1684     ptaj = ptasparserow;
1685     cnzi   = 0;
1686     for (j=0;j<ptanzi;j++) {
1687       prow = *ptaj++;
1688       pnzj = pi[prow+1] - pi[prow];
1689       pjj  = pj + pi[prow];
1690       /* add non-zero cols of P into the sorted linked list lnk */
1691       ierr = PetscLLAdd(pnzj,pjj,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1692       cnzi += nlnk;
1693     }
1694 
1695     /* If free space is not available, make more free space */
1696     /* Double the amount of total space in the list */
1697     if (current_space->local_remaining<cnzi) {
1698       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1699     }
1700 
1701     /* Copy data into free space, and zero out denserows */
1702     ierr = PetscLLClean(pn,pn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1703     current_space->array           += cnzi;
1704     current_space->local_used      += cnzi;
1705     current_space->local_remaining -= cnzi;
1706 
1707     for (j=0;j<ptanzi;j++) {
1708       ptadenserow[ptasparserow[j]] = 0;
1709     }
1710     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1711     /*        For now, we will recompute what is needed. */
1712     ci[i+1] = ci[i] + cnzi;
1713   }
1714   /* nnz is now stored in ci[pn], column indices are in the list of free space */
1715   /* Allocate space for cj, initialize cj, and */
1716   /* destroy list of free space and other temporary array(s) */
1717   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1718   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1719   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1720   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1721 
1722   /* Allocate space for ca */
1723   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1724   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1725 
1726   /* put together the new matrix */
1727   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1728 
1729   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1730   /* Since these are PETSc arrays, change flags to free them as necessary. */
1731   c = (Mat_SeqAIJ *)((*C)->data);
1732   c->freedata = PETSC_TRUE;
1733   c->nonew    = 0;
1734 
1735   /* Clean up. */
1736   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1737 
1738   PetscFunctionReturn(0);
1739 }
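
/*
   A minimal sketch (kept out of the build with #if 0) of the dense-marker
   technique the symbolic routine above uses to form the column pattern of one
   row of P^T*A: each visited column is flagged once in a dense work row and
   appended to a compact list.  The helper and its names are hypothetical and
   self-contained (dense[] must have one zeroed entry per column); the real
   code feeds the resulting list into the PetscLL* routines to merge rows of P.
*/
#if 0
PetscErrorCode MergeRowPatterns(PetscInt nrows,PetscInt *rows,PetscInt *ri,PetscInt *rj,
                                PetscInt *dense,PetscInt *sparse,PetscInt *nmerged)
{
  PetscInt i,k,row,col,cnt=0;

  PetscFunctionBegin;
  for (i=0; i<nrows; i++) {
    row = rows[i];
    for (k=ri[row]; k<ri[row+1]; k++) {
      col = rj[k];
      if (!dense[col]) {    /* first time this column is seen in the union */
        dense[col]    = -1;
        sparse[cnt++] = col;
      }
    }
  }
  *nmerged = cnt;
  /* reset the dense markers so the work row is clean for the next call */
  for (i=0; i<cnt; i++) dense[sparse[i]] = 0;
  PetscFunctionReturn(0);
}
#endif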
1740 
1741 #include "src/mat/impls/maij/maij.h"
1742 EXTERN_C_BEGIN
1743 #undef __FUNCT__
1744 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqMAIJ"
1745 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqMAIJ(Mat A,Mat PP,Mat *C)
1746 {
1747   /* This routine requires testing -- I don't think it works. */
1748   PetscErrorCode ierr;
1749   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1750   Mat_SeqMAIJ    *pp=(Mat_SeqMAIJ*)PP->data;
1751   Mat            P=pp->AIJ;
1752   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*p=(Mat_SeqAIJ*)P->data,*c;
1753   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
1754   PetscInt       *ci,*cj,*denserow,*sparserow,*ptadenserow,*ptasparserow,*ptaj;
1755   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M,ppdof=pp->dof;
1756   PetscInt       i,j,k,dof,pdof,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi;
1757   MatScalar      *ca;
1758 
1759   PetscFunctionBegin;
1760   /* Start timer */
1761   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1762 
1763   /* Get ij structure of P^T */
1764   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1765 
1766   /* Allocate ci array, arrays for fill computation and */
1767   /* free space for accumulating nonzero column info */
1768   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1769   ci[0] = 0;
1770 
1771   ierr = PetscMalloc((2*pn+2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1772   ierr = PetscMemzero(ptadenserow,(2*pn+2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1773   ptasparserow = ptadenserow  + an;
1774   denserow     = ptasparserow + an;
1775   sparserow    = denserow     + pn;
1776 
1777   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1778   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1779   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
1780   current_space = free_space;
1781 
1782   /* Determine symbolic info for each row of C: */
1783   for (i=0;i<pn/ppdof;i++) {
1784     ptnzi  = pti[i+1] - pti[i];
1785     ptanzi = 0;
1786     ptJ    = ptj + pti[i];
1787     for (dof=0;dof<ppdof;dof++) {
1788     /* Determine symbolic row of PtA: */
1789       for (j=0;j<ptnzi;j++) {
1790         arow = ptJ[j] + dof;
1791         anzj = ai[arow+1] - ai[arow];
1792         ajj  = aj + ai[arow];
1793         for (k=0;k<anzj;k++) {
1794           if (!ptadenserow[ajj[k]]) {
1795             ptadenserow[ajj[k]]    = -1;
1796             ptasparserow[ptanzi++] = ajj[k];
1797           }
1798         }
1799       }
1800       /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1801       ptaj = ptasparserow;
1802       cnzi   = 0;
1803       for (j=0;j<ptanzi;j++) {
1804         pdof = *ptaj%ppdof;    /* divide by ppdof, not the loop index dof, which is 0 on the first pass */
1805         prow = (*ptaj++)/ppdof;
1806         pnzj = pi[prow+1] - pi[prow];
1807         pjj  = pj + pi[prow];
1808         for (k=0;k<pnzj;k++) {
1809           if (!denserow[pjj[k]+pdof]) {
1810             denserow[pjj[k]+pdof] = -1;
1811             sparserow[cnzi++]     = pjj[k]+pdof;
1812           }
1813         }
1814       }
1815 
1816       /* sort sparserow */
1817       ierr = PetscSortInt(cnzi,sparserow);CHKERRQ(ierr);
1818 
1819       /* If free space is not available, make more free space */
1820       /* Double the amount of total space in the list */
1821       if (current_space->local_remaining<cnzi) {
1822         ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1823       }
1824 
1825       /* Copy data into free space, and zero out denserows */
1826       ierr = PetscMemcpy(current_space->array,sparserow,cnzi*sizeof(PetscInt));CHKERRQ(ierr);
1827       current_space->array           += cnzi;
1828       current_space->local_used      += cnzi;
1829       current_space->local_remaining -= cnzi;
1830 
1831       for (j=0;j<ptanzi;j++) {
1832         ptadenserow[ptasparserow[j]] = 0;
1833       }
1834       for (j=0;j<cnzi;j++) {
1835         denserow[sparserow[j]] = 0;
1836       }
1837       /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1838       /*        For now, we will recompute what is needed. */
1839       ci[i+1+dof] = ci[i+dof] + cnzi;
1840     }
1841   }
1842   /* nnz is now stored in ci[pn], column indices are in the list of free space */
1843   /* Allocate space for cj, initialize cj, and */
1844   /* destroy list of free space and other temporary array(s) */
1845   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1846   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1847   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1848 
1849   /* Allocate space for ca */
1850   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1851   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1852 
1853   /* put together the new matrix */
1854   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1855 
1856   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1857   /* Since these are PETSc arrays, change flags to free them as necessary. */
1858   c = (Mat_SeqAIJ *)((*C)->data);
1859   c->freedata = PETSC_TRUE;
1860   c->nonew    = 0;
1861 
1862   /* Clean up. */
1863   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1864 
1865   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1866   PetscFunctionReturn(0);
1867 }
1868 EXTERN_C_END
1869 
1870 /*
1871    MatPtAPNumeric - Computes the matrix projection C = P^T * A * P
1872 
1873    Collective on Mat
1874 
1875    Input Parameters:
1876 +  A - the matrix
1877 -  P - the projection matrix
1878 
1879    Output Parameters:
1880 .  C - the product matrix
1881 
1882    Notes:
1883    C must have been created by calling MatPtAPSymbolic() and must be destroyed by
1884    the user with MatDestroy().
1885 
1886    This routine is currently only implemented for pairs of AIJ matrices and classes
1887    which inherit from AIJ.  C will be of type MATAIJ.
1888 
1889    Level: intermediate
1890 
1891 .seealso: MatPtAP(),MatPtAPSymbolic(),MatMatMultNumeric()
1892 */
1893 #undef __FUNCT__
1894 #define __FUNCT__ "MatPtAPNumeric"
1895 PetscErrorCode MatPtAPNumeric(Mat A,Mat P,Mat C)
1896 {
1897   PetscErrorCode ierr;
1898   PetscErrorCode (*fA)(Mat,Mat,Mat);
1899   PetscErrorCode (*fP)(Mat,Mat,Mat);
1900 
1901   PetscFunctionBegin;
1902 
1903   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
1904   PetscValidType(A,1);
1905   MatPreallocated(A);
1906   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1907   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1908 
1909   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
1910   PetscValidType(P,2);
1911   MatPreallocated(P);
1912   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1913   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1914 
1915   PetscValidHeaderSpecific(C,MAT_COOKIE,3);
1916   PetscValidType(C,3);
1917   MatPreallocated(C);
1918   if (!C->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1919   if (C->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1920 
1921   if (P->N!=C->M) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->M);
1922   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
1923   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
1924   if (P->N!=C->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->N);
1925 
1926   /* For now, we do not dispatch based on the type of A and P */
1927   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
1928   fA = A->ops->ptapnumeric;
1929   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for A of type %s",A->type_name);
1930   fP = P->ops->ptapnumeric;
1931   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for P of type %s",P->type_name);
1932   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPNumeric requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
1933 
1934   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1935   ierr = (*fA)(A,P,C);CHKERRQ(ierr);
1936   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1937 
1938   PetscFunctionReturn(0);
1939 }
1940 
1941 #undef __FUNCT__
1942 #define __FUNCT__ "MatPtAPNumeric_SeqAIJ_SeqAIJ"
1943 PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ(Mat A,Mat P,Mat C)
1944 {
1945   PetscErrorCode ierr;
1946   PetscInt       flops=0;
1947   Mat_SeqAIJ     *a  = (Mat_SeqAIJ *) A->data;
1948   Mat_SeqAIJ     *p  = (Mat_SeqAIJ *) P->data;
1949   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
1950   PetscInt       *ai=a->i,*aj=a->j,*apj,*apjdense,*pi=p->i,*pj=p->j,*pJ=p->j,*pjj;
1951   PetscInt       *ci=c->i,*cj=c->j,*cjj;
1952   PetscInt       am=A->M,cn=C->N,cm=C->M;
1953   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1954   MatScalar      *aa=a->a,*apa,*pa=p->a,*pA=p->a,*paj,*ca=c->a,*caj;
1955 
1956   PetscFunctionBegin;
1957   /* Allocate temporary array for storage of one row of A*P */
1958   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1959   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1960 
1961   apj      = (PetscInt *)(apa + cn);
1962   apjdense = apj + cn;
1963 
1964   /* Clear old values in C */
1965   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1966 
1967   for (i=0;i<am;i++) {
1968     /* Form sparse row of A*P */
1969     anzi  = ai[i+1] - ai[i];
1970     apnzj = 0;
1971     for (j=0;j<anzi;j++) {
1972       prow = *aj++;
1973       pnzj = pi[prow+1] - pi[prow];
1974       pjj  = pj + pi[prow];
1975       paj  = pa + pi[prow];
1976       for (k=0;k<pnzj;k++) {
1977         if (!apjdense[pjj[k]]) {
1978           apjdense[pjj[k]] = -1;
1979           apj[apnzj++]     = pjj[k];
1980         }
1981         apa[pjj[k]] += (*aa)*paj[k];
1982       }
1983       flops += 2*pnzj;
1984       aa++;
1985     }
1986 
1987     /* Sort the j index array for quick sparse axpy. */
1988     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1989 
1990     /* Compute P^T*A*P using outer product (P^T)[:,j]*(A*P)[j,:]. */
1991     pnzi = pi[i+1] - pi[i];
1992     for (j=0;j<pnzi;j++) {
1993       nextap = 0;
1994       crow   = *pJ++;
1995       cjj    = cj + ci[crow];
1996       caj    = ca + ci[crow];
1997       /* Perform sparse axpy operation.  Note cjj includes apj. */
1998       for (k=0;nextap<apnzj;k++) {
1999         if (cjj[k]==apj[nextap]) {
2000           caj[k] += (*pA)*apa[apj[nextap++]];
2001         }
2002       }
2003       flops += 2*apnzj;
2004       pA++;
2005     }
2006 
2007     /* Zero the current row info for A*P */
2008     for (j=0;j<apnzj;j++) {
2009       apa[apj[j]]      = 0.;
2010       apjdense[apj[j]] = 0;
2011     }
2012   }
2013 
2014   /* Assemble the final matrix and clean up */
2015   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2016   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2017   ierr = PetscFree(apa);CHKERRQ(ierr);
2018   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
2019 
2020   PetscFunctionReturn(0);
2021 }
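
/*
   A minimal sketch (kept out of the build with #if 0) of the dense-accumulator
   step in the row loop above: one sparse row of A*P is accumulated in a dense
   value array (apa), a flag array (apjdense) and a compressed index list (apj)
   record which entries are nonzero, and the index list is sorted so the result
   can be scattered into C with a single sweep.  All names below are local to
   the sketch, and the caller is assumed to pass zeroed work arrays of length
   equal to the number of columns of P.
*/
#if 0
PetscErrorCode FormRowOfAP(PetscInt arow,PetscInt *ai,PetscInt *aj,MatScalar *aa,
                           PetscInt *pi,PetscInt *pj,MatScalar *pa,
                           MatScalar *apa,PetscInt *apjdense,PetscInt *apj,PetscInt *apnzj)
{
  PetscErrorCode ierr;
  PetscInt       j,k,prow,nz=0;

  PetscFunctionBegin;
  for (j=ai[arow]; j<ai[arow+1]; j++) {
    prow = aj[j];                            /* column of A = row of P */
    for (k=pi[prow]; k<pi[prow+1]; k++) {
      if (!apjdense[pj[k]]) {                /* new column of A*P      */
        apjdense[pj[k]] = -1;
        apj[nz++]       = pj[k];
      }
      apa[pj[k]] += aa[j]*pa[k];             /* A(arow,prow)*P(prow,col) */
    }
  }
  ierr = PetscSortInt(nz,apj);CHKERRQ(ierr); /* sorted for the sparse axpy into C */
  *apnzj = nz;
  PetscFunctionReturn(0);
}
#endif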
2022 
2023 /* Compute local C = P_loc^T * A * P - used by MatPtAP_MPIAIJ_MPIAIJ() */
2024 static PetscEvent logkey_matptapnumeric_local = 0;
2025 #undef __FUNCT__
2026 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_Local"
2027 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,Mat C)
2028 {
2029   PetscErrorCode ierr;
2030   PetscInt       flops=0;
2031   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
2032   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
2033   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
2034   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
2035   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
2036   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
2037   PetscInt       *pJ=pj_loc,*pjj;
2038   PetscInt       *ci=c->i,*cj=c->j,*cjj;
2039   PetscInt       am=A->m,cn=C->N,cm=C->M;
2040   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
2041   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*ca=c->a,*caj;
2042   MatScalar      *pa_loc=p_loc->a,*pA=pa_loc,*pa_oth=p_oth->a;
2043 
2044   PetscFunctionBegin;
2045   if (!logkey_matptapnumeric_local) {
2046     ierr = PetscLogEventRegister(&logkey_matptapnumeric_local,"MatPtAPNumeric_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
2047   }
2048   ierr = PetscLogEventBegin(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
2049 
2050   /* Allocate temporary array for storage of one row of A*P */
2051   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
2052   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
2053   apj      = (PetscInt *)(apa + cn);
2054   apjdense = apj + cn;
2055 
2056   /* Clear old values in C */
2057   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
2058 
2059   for (i=0;i<am;i++) {
2060     /* Form i-th sparse row of A*P */
2061     apnzj = 0;
2062     /* diagonal portion of A */
2063     anzi  = adi[i+1] - adi[i];
2064     for (j=0;j<anzi;j++) {
2065       prow = *adj;
2066       adj++;
2067       pnzj = pi_loc[prow+1] - pi_loc[prow];
2068       pjj  = pj_loc + pi_loc[prow];
2069       paj  = pa_loc + pi_loc[prow];
2070       for (k=0;k<pnzj;k++) {
2071         if (!apjdense[pjj[k]]) {
2072           apjdense[pjj[k]] = -1;
2073           apj[apnzj++]     = pjj[k];
2074         }
2075         apa[pjj[k]] += (*ada)*paj[k];
2076       }
2077       flops += 2*pnzj;
2078       ada++;
2079     }
2080     /* off-diagonal portion of A */
2081     anzi  = aoi[i+1] - aoi[i];
2082     for (j=0;j<anzi;j++) {
2083       col = a->garray[*aoj];
2084       if (col < cstart){
2085         prow = *aoj;
2086       } else if (col >= cend){
2087         prow = *aoj;
2088       } else {
2089         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
2090       }
2091       aoj++;
2092       pnzj = pi_oth[prow+1] - pi_oth[prow];
2093       pjj  = pj_oth + pi_oth[prow];
2094       paj  = pa_oth + pi_oth[prow];
2095       for (k=0;k<pnzj;k++) {
2096         if (!apjdense[pjj[k]]) {
2097           apjdense[pjj[k]] = -1;
2098           apj[apnzj++]     = pjj[k];
2099         }
2100         apa[pjj[k]] += (*aoa)*paj[k];
2101       }
2102       flops += 2*pnzj;
2103       aoa++;
2104     }
2105     /* Sort the j index array for quick sparse axpy. */
2106     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
2107 
2108     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
2109     pnzi = pi_loc[i+1] - pi_loc[i];
2110     for (j=0;j<pnzi;j++) {
2111       nextap = 0;
2112       crow   = *pJ++;
2113       cjj    = cj + ci[crow];
2114       caj    = ca + ci[crow];
2115       /* Perform sparse axpy operation.  Note cjj includes apj. */
2116       for (k=0;nextap<apnzj;k++) {
2117         if (cjj[k]==apj[nextap]) {
2118           caj[k] += (*pA)*apa[apj[nextap++]];
2119         }
2120       }
2121       flops += 2*apnzj;
2122       pA++;
2123     }
2124 
2125     /* Zero the current row info for A*P */
2126     for (j=0;j<apnzj;j++) {
2127       apa[apj[j]]      = 0.;
2128       apjdense[apj[j]] = 0;
2129     }
2130   }
2131 
2132   /* Assemble the final matrix and clean up */
2133   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2134   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2135   ierr = PetscFree(apa);CHKERRQ(ierr);
2136   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
2137   ierr = PetscLogEventEnd(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
2138   PetscFunctionReturn(0);
2139 }
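
/*
   A minimal sketch (kept out of the build with #if 0) of the MPIAIJ column-map
   check used above: a->garray[] translates a local column index of the
   off-diagonal block a->B into a global column, and a global column inside
   [cstart,cend) belongs to the diagonal block a->A and must never appear in
   a->B, hence the error branch.  The helper name is hypothetical.
*/
#if 0
PetscErrorCode CheckOffDiagColumn(Mat_MPIAIJ *a,PetscInt localcol)
{
  PetscInt col = a->garray[localcol]; /* global column index */

  PetscFunctionBegin;
  if (col >= a->cstart && col < a->cend) {
    SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
  }
  PetscFunctionReturn(0);
}
#endif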
2140 static PetscEvent logkey_matptapsymbolic_local = 0;
2141 #undef __FUNCT__
2142 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local"
2143 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,PetscReal fill,Mat *C)
2144 {
2145   PetscErrorCode ierr;
2146   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
2147   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
2148   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
2149   PetscInt       *pti,*ptj,*ptJ,*ajj,*pjj;
2150   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
2151   PetscInt       *ci,*cj,*ptaj;
2152   PetscInt       aN=A->N,am=A->m,pN=P_loc->N;
2153   PetscInt       i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
2154   PetscInt       pm=P_loc->m,nlnk,*lnk;
2155   MatScalar      *ca;
2156   PetscBT        lnkbt;
2157   PetscInt       prend,nprow_loc,nprow_oth;
2158   PetscInt       *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
2159   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
2160   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
2161 
2162   PetscFunctionBegin;
2163   if (!logkey_matptapsymbolic_local) {
2164     ierr = PetscLogEventRegister(&logkey_matptapsymbolic_local,"MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
2165   }
2166   ierr = PetscLogEventBegin(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
2167 
2168   prend = prstart + pm;
2169 
2170   /* get ij structure of P_loc^T */
2171   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
2172   ptJ=ptj;
2173 
2174   /* Allocate ci array, arrays for fill computation and */
2175   /* free space for accumulating nonzero column info */
2176   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
2177   ci[0] = 0;
2178 
2179   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
2180   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
2181   ptasparserow_loc = ptadenserow_loc + aN;
2182   ptadenserow_oth  = ptasparserow_loc + aN;
2183   ptasparserow_oth = ptadenserow_oth + aN;
2184 
2185   /* create and initialize a linked list */
2186   nlnk = pN+1;
2187   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2188 
2189   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
2190   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
2191   nnz           = adi[am] + aoi[am];
2192   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
2193   current_space = free_space;
2194 
2195   /* determine symbolic info for each row of C: */
2196   for (i=0;i<pN;i++) {
2197     ptnzi  = pti[i+1] - pti[i];
2198     nprow_loc = 0; nprow_oth = 0;
2199     /* i-th row of symbolic P_loc^T*A_loc: */
2200     for (j=0;j<ptnzi;j++) {
2201       arow = *ptJ++;
2202       /* diagonal portion of A */
2203       anzj = adi[arow+1] - adi[arow];
2204       ajj  = adj + adi[arow];
2205       for (k=0;k<anzj;k++) {
2206         col = ajj[k]+prstart;
2207         if (!ptadenserow_loc[col]) {
2208           ptadenserow_loc[col]    = -1;
2209           ptasparserow_loc[nprow_loc++] = col;
2210         }
2211       }
2212       /* off-diagonal portion of A */
2213       anzj = aoi[arow+1] - aoi[arow];
2214       ajj  = aoj + aoi[arow];
2215       for (k=0;k<anzj;k++) {
2216         col = a->garray[ajj[k]];  /* global col */
2217         if (col < cstart){
2218           col = ajj[k];
2219         } else if (col >= cend){
2220           col = ajj[k] + pm;
2221         } else {
2222           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
2223         }
2224         if (!ptadenserow_oth[col]) {
2225           ptadenserow_oth[col]    = -1;
2226           ptasparserow_oth[nprow_oth++] = col;
2227         }
2228       }
2229     }
2230 
2231     /* using symbolic info of local PtA, determine symbolic info for row of C: */
2232     cnzi   = 0;
2233     /* rows of P_loc */
2234     ptaj = ptasparserow_loc;
2235     for (j=0; j<nprow_loc; j++) {
2236       prow = *ptaj++;
2237       prow -= prstart; /* undo the prstart shift applied above: back to a local row of P_loc */
2238       pnzj = pi_loc[prow+1] - pi_loc[prow];
2239       pjj  = pj_loc + pi_loc[prow];
2240       /* add non-zero cols of P into the sorted linked list lnk */
2241       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2242       cnzi += nlnk;
2243     }
2244     /* rows of P_oth */
2245     ptaj = ptasparserow_oth;
2246     for (j=0; j<nprow_oth; j++) {
2247       prow = *ptaj++;
2248       if (prow >= prend) prow -= pm; /* rm */
2249       pnzj = pi_oth[prow+1] - pi_oth[prow];
2250       pjj  = pj_oth + pi_oth[prow];
2251       /* add non-zero cols of P into the sorted linked list lnk */
2252       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2253       cnzi += nlnk;
2254     }
2255 
2256     /* If free space is not available, make more free space */
2257     /* Double the amount of total space in the list */
2258     if (current_space->local_remaining<cnzi) {
2259       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
2260     }
2261 
2262     /* Copy data into free space, and zero out denserows */
2263     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
2264     current_space->array           += cnzi;
2265     current_space->local_used      += cnzi;
2266     current_space->local_remaining -= cnzi;
2267 
2268     for (j=0;j<nprow_loc; j++) {
2269       ptadenserow_loc[ptasparserow_loc[j]] = 0;
2270     }
2271     for (j=0;j<nprow_oth; j++) {
2272       ptadenserow_oth[ptasparserow_oth[j]] = 0;
2273     }
2274 
2275     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
2276     /*        For now, we will recompute what is needed. */
2277     ci[i+1] = ci[i] + cnzi;
2278   }
2279   /* nnz is now stored in ci[pN], column indices are in the list of free space */
2280   /* Allocate space for cj, initialize cj, and */
2281   /* destroy list of free space and other temporary array(s) */
2282   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
2283   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
2284   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
2285   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2286 
2287   /* Allocate space for ca */
2288   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
2289   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
2290 
2291   /* put together the new matrix */
2292   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,pN,pN,ci,cj,ca,C);CHKERRQ(ierr);
2293 
2294   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
2295   /* Since these are PETSc arrays, change flags to free them as necessary. */
2296   c = (Mat_SeqAIJ *)((*C)->data);
2297   c->freedata = PETSC_TRUE;
2298   c->nonew    = 0;
2299 
2300   /* Clean up. */
2301   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
2302 
2303   ierr = PetscLogEventEnd(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
2304   PetscFunctionReturn(0);
2305 }
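
/*
   A minimal sketch (kept out of the build with #if 0) of the initial
   free-space estimate used by the symbolic routines above.  With
   nnz(A_loc) = adi[am]+aoi[am], the first buffer holds roughly
   fill*nnz*pN/aN entries and GetMoreSpace() doubles the total whenever a row
   does not fit.  For example, nnz=10000, pN=500, aN=2000 and fill=1.0 give an
   initial buffer of 2501 entries.  The helper name is hypothetical.
*/
#if 0
PetscInt EstimateInitialSpace(PetscReal fill,PetscInt nnz,PetscInt pN,PetscInt aN)
{
  return (PetscInt)(fill*nnz*pN/aN + 1);
}
#endif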
2306