xref: /petsc/src/mat/impls/aij/seq/matptap.c (revision c1683ab29e9cef5d711100819eed78d0c8de7f4d)
1 /*
2   Defines projective product routines where A is an AIJ matrix
3           C = P^T * A * P
4 */
5 
6 #include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
7 #include "src/mat/utils/freespace.h"
8 #include "src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatPtAP"
13 /*@
14    MatPtAP - Creates the matrix projection C = P^T * A * P
15 
16    Collective on Mat
17 
18    Input Parameters:
19 +  A - the matrix
20 .  P - the projection matrix
21 .  scall - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
22 -  fill - expected fill as ratio of nnz(C)/(nnz(A) + nnz(P))
23 
24    Output Parameters:
25 .  C - the product matrix
26 
27    Notes:
28    C will be created and must be destroyed by the user with MatDestroy().
29 
30    This routine is currently implemented for pairs of AIJ matrices (SeqAIJ and MPIAIJ) and
31    classes which inherit from them.  C will be of the same AIJ type as A and P.
32 
33    Level: intermediate
34 
35 .seealso: MatPtAPSymbolic(),MatPtAPNumeric(),MatMatMult()
36 @*/
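/*
   Example usage (a minimal sketch, not part of the original source; assumes A and P are
   assembled AIJ matrices created elsewhere by the caller, and fill is a positive estimate):

      Mat C;
      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);   first call creates C
      ...  change the numerical values of A, keeping its nonzero pattern  ...
      ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);     reuses the structure of C
      ierr = MatDestroy(C);CHKERRQ(ierr);
*/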
37 PetscErrorCode MatPtAP(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
38 {
39   PetscErrorCode ierr;
40   PetscErrorCode (*fA)(Mat,Mat,MatReuse,PetscReal,Mat *);
41   PetscErrorCode (*fP)(Mat,Mat,MatReuse,PetscReal,Mat *);
42 
43   PetscFunctionBegin;
44   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
45   PetscValidType(A,1);
46   MatPreallocated(A);
47   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
48   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
49   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
50   PetscValidType(P,2);
51   MatPreallocated(P);
52   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
53   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
54   PetscValidPointer(C,3);
55 
56   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
57 
58   if (fill <=0.0) SETERRQ1(PETSC_ERR_ARG_SIZ,"fill=%g must be > 0.0",fill);
59 
60   /* For now, we do not dispatch based on the type of A and P */
61   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
62   fA = A->ops->ptap;
63   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for A of type %s",A->type_name);
64   fP = P->ops->ptap;
65   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for P of type %s",P->type_name);
66   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAP requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
67 
68   ierr = PetscLogEventBegin(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
69   ierr = (*fA)(A,P,scall,fill,C);CHKERRQ(ierr);
70   ierr = PetscLogEventEnd(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 
74 EXTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,PetscReal,Mat*);
75 EXTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,Mat C);
76 
77 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
78 #undef __FUNCT__
79 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
80 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
81 {
82   PetscErrorCode       ierr;
83   Mat_MatMatMultMPI    *ptap;
84   PetscObjectContainer container;
85 
86   PetscFunctionBegin;
87   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
88   if (container) {
89     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
90     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
91     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
92     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
93     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
94     /* free ptap only when it was actually attached; otherwise the pointer is uninitialized */
95     ierr = PetscFree(ptap);CHKERRQ(ierr);
96     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
97     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
98   }
99 
100   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
101   PetscFunctionReturn(0);
102 }
103 
104 #undef __FUNCT__
105 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
106 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
107 {
108   PetscErrorCode       ierr;
109   Mat_MatMatMultMPI    *ptap;
110   PetscObjectContainer container;
111 
112   PetscFunctionBegin;
113   if (scall == MAT_INITIAL_MATRIX){
114     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
115     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
116 
117     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
118     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
119 
120     /* get P_loc by taking all local rows of P */
121     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
122 
123     /* attach the supporting struct to P for reuse */
124     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
125     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
126     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
127     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
128 
129     /* now, compute symbolic local P^T*A*P */
130     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
131     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
132   } else if (scall == MAT_REUSE_MATRIX){
133     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
134     if (container) {
135       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
136     } else {
137       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
138     }
139     /* update P_oth */
140     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
141 
142   } else {
143     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %D",scall);
144   }
145   /* now, compute numeric local P^T*A*P */
146   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
147   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
148   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
149 
150   PetscFunctionReturn(0);
151 }
152 /* Set symbolic info for i-th row of local product C=P^T*A*P */
153 #define MatPtAPSymbolicLocal_Private(ptM,pti,ptj,ci,cj) 0;\
154 {\
155   /* allocate ci array, arrays for fill computation and */\
156   /* free space for accumulating nonzero column info */\
157   ierr = PetscMalloc((ptM+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);\
158   ci[0] = 0;\
159 \
160   /* set initial free space to be nnz(A) scaled by fill*P->N/PtM. */\
161   /* this should be reasonable if sparsity of PtAP is similar to that of A. */\
162   nnz           = adi[am] + aoi[am];\
163   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/ptM+1),&free_space);CHKERRQ(ierr);\
164   current_space = free_space;\
165 \
166   ptJ=ptj;\
167   for (i=0; i<ptM; i++) {\
168     ptnzi  = pti[i+1] - pti[i];\
169     nprow_loc = 0; nprow_oth = 0;\
170     /* i-th row of symbolic Pt*A: */\
171     for (j=0;j<ptnzi;j++) {\
172       arow = *ptJ++;\
173       /* diagonal portion of A */\
174       anzj = adi[arow+1] - adi[arow];\
175       ajj  = adj + adi[arow];\
176       for (k=0;k<anzj;k++) {\
177         col = ajj[k]+prstart;\
178         if (!ptadenserow_loc[col]) {\
179           ptadenserow_loc[col]    = -1;\
180           ptasparserow_loc[nprow_loc++] = col;\
181         }\
182       }\
183       /* off-diagonal portion of A */\
184       anzj = aoi[arow+1] - aoi[arow];\
185       ajj  = aoj + aoi[arow];\
186       for (k=0;k<anzj;k++) {\
187         col = a->garray[ajj[k]];  /* global col */\
188         if (col < cstart){\
189           col = ajj[k];\
190         } else if (col >= cend){\
191           col = ajj[k] + pm;\
192         } else {\
193           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");\
194         }\
195         if (!ptadenserow_oth[col]) {\
196           ptadenserow_oth[col]    = -1;\
197           ptasparserow_oth[nprow_oth++] = col;\
198         }\
199       }\
200     }\
201     /* determine symbolic info for row of C_seq: */\
202     cnzi   = 0;\
203     /* rows of P_loc */\
204     ptaj = ptasparserow_loc;\
205     for (j=0; j<nprow_loc; j++) {\
206       prow = *ptaj++; \
207       prow -= prstart; /* rm */\
208       pnzj = pi_loc[prow+1] - pi_loc[prow];\
209       pjj  = pj_loc + pi_loc[prow];\
210       /* add non-zero cols of P into the sorted linked list lnk */\
211       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
212       cnzi += nlnk;\
213     }\
214     /* rows of P_oth */\
215     ptaj = ptasparserow_oth;\
216     for (j=0; j<nprow_oth; j++) {\
217       prow = *ptaj++; \
218       if (prow >= prend) prow -= pm; /* rm */\
219       pnzj = pi_oth[prow+1] - pi_oth[prow];\
220       pjj  = pj_oth + pi_oth[prow];\
221       /* add non-zero cols of P into the sorted linked list lnk */\
222       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
223       cnzi += nlnk;\
224     }\
225    \
226     /* If free space is not available, make more free space */\
227     /* Double the amount of total space in the list */\
228     if (current_space->local_remaining<cnzi) {\
229       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);\
230     }\
231 \
232     /* Copy data into free space, and zero out denserows */\
233     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);\
234     current_space->array           += cnzi;\
235     current_space->local_used      += cnzi;\
236     current_space->local_remaining -= cnzi;\
237    \
238     for (j=0;j<nprow_loc; j++) {\
239       ptadenserow_loc[ptasparserow_loc[j]] = 0;\
240     }\
241     for (j=0;j<nprow_oth; j++) {\
242       ptadenserow_oth[ptasparserow_oth[j]] = 0;\
243     }\
244     \
245     /* Aside: Perhaps we should save the pta info for the numerical factorization. */\
246     /*        For now, we will recompute what is needed. */ \
247     ci[i+1] = ci[i] + cnzi;\
248   }\
249   /* nnz is now stored in ci[ptM], column indices are in the list of free space */\
250   /* Allocate space for cj, initialize cj, and */\
251   /* destroy list of free space and other temporary array(s) */\
252   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);\
253   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);\
254 }
255 #undef __FUNCT__
256 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
257 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
258 {
259   PetscErrorCode       ierr;
260   Mat                  P_loc,P_oth;
261   Mat_MatMatMultMPI    *ptap;
262   PetscObjectContainer container;
263   FreeSpaceList        free_space=PETSC_NULL,current_space;
264   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
265   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c,
266                        *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
267   Mat_SeqAIJ           *p_loc,*p_oth;
268   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj,*rmap=p->garray;
269   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
270   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
271   PetscInt             aN=A->N,am=A->m,pN=P->N;
272   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi,row;
273   MatScalar            *ca;
274   PetscBT              lnkbt;
275   PetscInt             prstart,prend,nprow_loc,nprow_oth;
276   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
277 
278   MPI_Comm             comm=A->comm;
279   Mat                  B_mpi;
280   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
281   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
282   PetscInt             len,proc,*dnz,*onz,*owners;
283   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
284   PetscInt             nrows,tnrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
285   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
286   MPI_Status           *status;
287   Mat_Merge_SeqsToMPI  *merge;
288 
289   PetscFunctionBegin;
290   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
291   if (container) {
292     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
293   } else {
294     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
295   }
296 
297   /* determine row ownership of C */
298   /*---------------------------------------------------------*/
299   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
300   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
301   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
302 
303   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
304   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
305   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
306   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
307 
308   /* get data from P_loc and P_oth */
309   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
310   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
311   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
312   prend = prstart + pm;
313 
314   /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
315   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
316   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
317   ptasparserow_loc = ptadenserow_loc + aN;
318   ptadenserow_oth  = ptasparserow_loc + aN;
319   ptasparserow_oth = ptadenserow_oth + aN;
320 
321   /* create and initialize a linked list */
322   nlnk = pN+1;
323   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
324 
325   /* Pt = P_loc^T */
326   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
327   ierr = MatPtAPSymbolicLocal_Private(pN,pti,ptj,ci,cj);CHKERRQ(ierr);
328 
329   /* clean up */
330   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
331   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
332   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
333 
334   /* determine the number of messages to send, their lengths */
335   /*---------------------------------------------------------*/
336   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
337   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
338   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
339   len_s  = merge->len_s;
340   len = 0;  /* total length of all outgoing i-structure messages; used to allocate buf_s[] */
341   merge->nsend = 0;
342   tnrows = (p->B)->N; /* total num of rows to be sent to other processors */
343   proc = 0;
344   for (i=0; i<tnrows; i++){
345     while (rmap[i] >= owners[proc+1]) proc++;
346     len_si[proc]++;
347   }
348   for (proc=0; proc<size; proc++){
349     len_s[proc] = 0;
350     if (len_si[proc]){
351       merge->nsend++;
352       len_si[proc] = 2*(len_si[proc] + 1);
353       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of col indices to be sent to [proc] */
354       len += len_si[proc];
355     }
356   }
357 
358   /* determine the number and length of messages to receive for ij-structure */
359   /*-------------------------------------------------------------------------*/
360   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
361   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
362 
363   /* post the Irecv of j-structure */
364   /*-------------------------------*/
365   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
366   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
367 
368   /* post the Isend of j-structure */
369   /*--------------------------------*/
370   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
371   sj_waits = si_waits + merge->nsend;
372 
373   for (proc=0, k=0; proc<size; proc++){
374     if (!len_s[proc]) continue;
375     i = owners[proc];
376     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
377     k++;
378   }
379 
380   /* receives and sends of j-structure are complete */
381   /*------------------------------------------------*/
382   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
383   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
384   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
385 
386   /* send and recv i-structure */
387   /*---------------------------*/
388   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
389   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
390 
391   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
392   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
393 
394   j = 0; /* row index to be sent */
395   for (proc=0,k=0; proc<size; proc++){
396     if (!len_s[proc]) continue;
397     /* form outgoing message for i-structure:
398          buf_si[0]:                 nrows to be sent
399                [1:nrows]:           row index (global)
400                [nrows+1:2*nrows+1]: i-structure index
401     */
402     /*-------------------------------------------*/
403     nrows = len_si[proc]/2 - 1;
404     buf_si_i    = buf_si + nrows+1;
405     buf_si[0]   = nrows;
406     buf_si_i[0] = 0;
407     for (i=0; i<nrows; i++){
408       row = rmap[j++];
409       anzi = ci[row+1] - ci[row];
410       buf_si_i[i+1] = buf_si_i[i] + anzi; /* i-structure */
411       buf_si[i+1]   = row - owners[proc]; /* local row index */
412     }
413     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
414     k++;
415     buf_si += len_si[proc];
416   }
417 
418   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
419   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
420 
421   ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
422   for (i=0; i<merge->nrecv; i++){
423     ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
424   }
425 
426   ierr = PetscFree(len_si);CHKERRQ(ierr);
427   ierr = PetscFree(len_ri);CHKERRQ(ierr);
428   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
429   ierr = PetscFree(si_waits);CHKERRQ(ierr);
430   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
431   ierr = PetscFree(buf_s);CHKERRQ(ierr);
432   ierr = PetscFree(status);CHKERRQ(ierr);
433 
434   /* compute a local seq matrix in each processor */
435   /*----------------------------------------------*/
436   /* allocate bi array and free space for accumulating nonzero column info */
437   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
438   bi[0] = 0;
439 
440   /* create and initialize a linked list */
441   nlnk = pN+1;
442   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
443 
444   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
445   len = 0;
446   len  = ci[owners[rank+1]] - ci[owners[rank]];
447   free_space=PETSC_NULL;
448   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
449   current_space = free_space;
450 
451   /* determine symbolic info for each local row */
452   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
453   nextrow = buf_ri_k + merge->nrecv;
454   nextci  = nextrow + merge->nrecv;
455   for (k=0; k<merge->nrecv; k++){
456     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
457     nrows = *buf_ri_k[k];
458     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
459     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th received i-structure */
460   }
461 
462   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
463   len = 0;
464   for (i=0; i<pn; i++) {
465     bnzi   = 0;
466     /* add local non-zero cols of this proc's C_seq into lnk */
467     arow   = owners[rank] + i;
468     anzi   = ci[arow+1] - ci[arow];
469     cji    = cj + ci[arow];
470     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
471     bnzi += nlnk;
472     /* add received col data into lnk */
473     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
474       if (i == *nextrow[k]) { /* i-th row */
475         anzi = *(nextci[k]+1) - *nextci[k];
476         cji  = buf_rj[k] + *nextci[k];
477         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
478         bnzi += nlnk;
479         nextrow[k]++; nextci[k]++;
480       }
481     }
482     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
483 
484     /* if free space is not available, make more free space */
485     if (current_space->local_remaining<bnzi) {
486       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
487       nspacedouble++;
488     }
489     /* copy data into free space, then initialize lnk */
490     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
491     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
492 
493     current_space->array           += bnzi;
494     current_space->local_used      += bnzi;
495     current_space->local_remaining -= bnzi;
496 
497     bi[i+1] = bi[i] + bnzi;
498   }
499 
500   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
501 
502   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
503   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
504   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
505 
506   /* create symbolic parallel matrix B_mpi */
507   /*---------------------------------------*/
508   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
509   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
510   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
511   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
512 
513   /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
514   B_mpi->assembled     = PETSC_FALSE;
515   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
516   merge->bi            = bi;
517   merge->bj            = bj;
518   merge->ci            = ci;
519   merge->cj            = cj;
520   merge->buf_ri        = buf_ri;
521   merge->buf_rj        = buf_rj;
522   merge->C_seq         = PETSC_NULL;
523 
524   /* attach the supporting struct to B_mpi for reuse */
525   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
526   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
527   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
528   *C = B_mpi;
529 
530   PetscFunctionReturn(0);
531 }
532 
533 #undef __FUNCT__
534 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
535 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
536 {
537   PetscErrorCode       ierr;
538   Mat_Merge_SeqsToMPI  *merge;
539   Mat_MatMatMultMPI    *ptap;
540   PetscObjectContainer cont_merge,cont_ptap;
541   PetscInt             flops=0;
542   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
543   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,
544                        *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data,
545                        *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
546   Mat                  C_seq;
547   Mat_SeqAIJ           *cseq,*p_oth;
548   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend;
549   PetscInt             *pi_oth,*pj_oth,*pJ_d=pd->j,*pJ_o=po->j,*pjj;
550   PetscInt             i,j,k,nzi,pnzi,apnzj,nextap,pnzj,prow,crow;
551   MatScalar            *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj,**abuf_r,*ba_i,*ca;
552   MatScalar            *pA_d=pd->a,*pA_o=po->a,*pa_oth;
553   PetscInt             am=A->m,cN=C->N,cm=C->m;
554   MPI_Comm             comm=C->comm;
555   PetscMPIInt          size,rank,taga,*len_s,**buf_ri,**buf_rj,**buf_ri_k,**nextrow,**nextcseqi;
556   PetscInt             *owners,proc,nrows;
557   PetscInt             *cjj,cseqnzi,*bj_i,*bi,*bj,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
558   MPI_Request          *s_waits,*r_waits;
559   MPI_Status           *status;
560   PetscInt             *cseqi,*cseqj,col;
561 
562   PetscFunctionBegin;
563   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
564   if (cont_merge) {
565     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
566   } else {
567     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
568   }
569   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
570   if (cont_ptap) {
571     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
572   } else {
573     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
574   }
575   /* get data from symbolic products */
576   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
577   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
578 
579   cseqi = merge->ci; cseqj=merge->cj;
580   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
581   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
582 
583   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
584   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
585   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
586 
587   /* Allocate temporary array for storage of one row of A*P and one row of C */
588   ierr = PetscMalloc(2*cN*(sizeof(MatScalar)+sizeof(PetscInt)),&apa);CHKERRQ(ierr);
589   ierr = PetscMemzero(apa,2*cN*(sizeof(MatScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
590   ca       = (MatScalar*)(apa + cN);
591   apj      = (PetscInt *)(ca + cN);
592   apjdense = (PetscInt *)(apj + cN);
593 
594   /* Clear old values in C */
595   ierr = PetscMemzero(cd->a,cd->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
596   ierr = PetscMemzero(co->a,co->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
597 
598   for (i=0;i<am;i++) {
599     /* Form i-th sparse row of A*P */
600      apnzj = 0;
601     /* diagonal portion of A */
602     nzi  = adi[i+1] - adi[i];
603     for (j=0;j<nzi;j++) {
604       prow = *adj++;
605       /* diagonal portion of P */
606       pnzj = pd->i[prow+1] - pd->i[prow];
607       pjj  = pd->j + pd->i[prow]; /* local col index of P */
608       paj  = pd->a + pd->i[prow];
609       for (k=0;k<pnzj;k++) {
610         col = *pjj + p->cstart; pjj++; /* global col index of P */
611         if (!apjdense[col]) {
612           apjdense[col] = -1;
613           apj[apnzj++]  = col;
614         }
615         apa[col] += (*ada)*paj[k];
616       }
617       flops += 2*pnzj;
618       /* off-diagonal portion of P */
619       pnzj = po->i[prow+1] - po->i[prow];
620       pjj  = po->j + po->i[prow]; /* local col index of P */
621       paj  = po->a + po->i[prow];
622       for (k=0;k<pnzj;k++) {
623         col = p->garray[*pjj]; pjj++; /* global col index of P */
624         if (!apjdense[col]) {
625           apjdense[col] = -1;
626           apj[apnzj++]  = col;
627         }
628         apa[col] += (*ada)*paj[k];
629       }
630       flops += 2*pnzj;
631 
632       ada++;
633     }
634     /* off-diagonal portion of A */
635     nzi  = aoi[i+1] - aoi[i];
636     for (j=0;j<nzi;j++) {
637       prow = *aoj++;
638       pnzj = pi_oth[prow+1] - pi_oth[prow];
639       pjj  = pj_oth + pi_oth[prow];
640       paj  = pa_oth + pi_oth[prow];
641       for (k=0;k<pnzj;k++) {
642         if (!apjdense[pjj[k]]) {
643           apjdense[pjj[k]] = -1;
644           apj[apnzj++]     = pjj[k];
645         }
646         apa[pjj[k]] += (*aoa)*paj[k];
647       }
648       flops += 2*pnzj;
649       aoa++;
650     }
651     /* Sort the j index array for quick sparse axpy. */
652     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
653 
654     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
655     /* diagonal portion of P -- gives matrix value of local C */
656     pnzi = pd->i[i+1] - pd->i[i];
657     for (j=0;j<pnzi;j++) {
658       crow = (*pJ_d++) + owners[rank];
659       cjj    = cseqj + cseqi[crow];
660       /* add value into C */
661       for (k=0; k<apnzj; k++) ca[k] = (*pA_d)*apa[apj[k]];
662       ierr = MatSetValues(C,1,&crow,apnzj,apj,ca,ADD_VALUES);CHKERRQ(ierr);
663       ierr = PetscMemzero(ca,apnzj*(sizeof(MatScalar)));CHKERRQ(ierr);
664       pA_d++;
665       flops += 2*apnzj;
666     }
667 
668     /* off-diagonal portion of P -- gives matrix values to be sent to others */
669     pnzi = po->i[i+1] - po->i[i];
670     for (j=0;j<pnzi;j++) {
671       crow   = p->garray[*pJ_o++];
672       cjj    = cseqj + cseqi[crow];
673       caj    = cseqa + cseqi[crow];
674       /* add value into C_seq to be sent to other processors */
675       nextap = 0;
676       for (k=0;nextap<apnzj;k++) {
677         if (cjj[k]==apj[nextap]) {
678           caj[k] += (*pA_o)*apa[apj[nextap++]];
679         }
680       }
681       flops += 2*apnzj;
682       pA_o++;
683     }
684 
685     /* Zero the current row info for A*P */
686     for (j=0;j<apnzj;j++) {
687       apa[apj[j]]      = 0.;
688       apjdense[apj[j]] = 0;
689     }
690   }
691 
692   /* send and recv matrix values */
693   /*-----------------------------*/
694   bi     = merge->bi;
695   bj     = merge->bj;
696   buf_ri = merge->buf_ri;
697   buf_rj = merge->buf_rj;
698   len_s  = merge->len_s;
699   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
700   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
701 
702   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
703   for (proc=0,k=0; proc<size; proc++){
704     if (!len_s[proc]) continue;
705     i = owners[proc];
706     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
707     k++;
708   }
709   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
710   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
711   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
712   ierr = PetscFree(status);CHKERRQ(ierr);
713 
714   ierr = PetscFree(s_waits);CHKERRQ(ierr);
715   ierr = PetscFree(r_waits);CHKERRQ(ierr);
716   ierr = PetscFree(cseqa);CHKERRQ(ierr);
717 
718   /* insert mat values of mpimat */
719   /*----------------------------*/
720   ba_i = apa; /* rename, temporary array for storage of one row of C */
721   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
722   nextrow = buf_ri_k + merge->nrecv;
723   nextcseqi = nextrow + merge->nrecv;
724 
725   for (k=0; k<merge->nrecv; k++){
726     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
727     nrows        = *(buf_ri_k[k]);
728     nextrow[k]   = buf_ri_k[k]+1;  /* next row index of k-th received i-structure */
729     nextcseqi[k] = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of k-th received i-structure */
730   }
731 
732   /* add received local vals to C */
733   for (i=0; i<cm; i++) {
734     crow = owners[rank] + i;
735     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
736     bnzi = bi[i+1] - bi[i];
737     nzi = 0;
738     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
739       /* i-th row */
740       if (i == *nextrow[k]) {
741         cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k];
742         cseqj   = buf_rj[k] + *(nextcseqi[k]);
743         cseqa   = abuf_r[k] + *(nextcseqi[k]);
744         nextcseqj = 0;
745         for (j=0; nextcseqj<cseqnzi; j++){
746           if (*(bj_i + j) == cseqj[nextcseqj]){ /* bcol == cseqcol */
747             ba_i[j] += cseqa[nextcseqj++];
748             nzi++;
749           }
750         }
751         nextrow[k]++; nextcseqi[k]++;
752       }
753     }
754     if (nzi>0){
755       ierr = MatSetValues(C,1,&crow,bnzi,bj_i,ba_i,ADD_VALUES);CHKERRQ(ierr);
756       ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
757       flops += 2*nzi;
758     }
759   }
760   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
761   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
762 
763   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
764   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
765   ierr = PetscFree(apa);CHKERRQ(ierr);
766   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
767 
768   PetscFunctionReturn(0);
769 }
770 
771 #undef __FUNCT__
772 #define __FUNCT__ "MatPtAP_SeqAIJ_SeqAIJ"
773 PetscErrorCode MatPtAP_SeqAIJ_SeqAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
774 {
775   PetscErrorCode ierr;
776 
777   PetscFunctionBegin;
778   if (scall == MAT_INITIAL_MATRIX){
779     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
780     ierr = MatPtAPSymbolic_SeqAIJ_SeqAIJ(A,P,fill,C);CHKERRQ(ierr);
781     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
782   }
783   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
784   ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,*C);CHKERRQ(ierr);
785   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
786   PetscFunctionReturn(0);
787 }
788 
789 /*
790    MatPtAPSymbolic - Creates the (i,j) structure of the matrix projection C = P^T * A * P
791 
792    Collective on Mat
793 
794    Input Parameters:
795 +  A - the matrix
796 -  P - the projection matrix
797 
798    Output Parameters:
799 .  C - the (i,j) structure of the product matrix
800 
801    Notes:
802    C will be created and must be destroyed by the user with MatDestroy().
803 
804    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
805    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.  The product is computed using
806    this (i,j) structure by calling MatPtAPNumeric().
807 
808    Level: intermediate
809 
810 .seealso: MatPtAP(),MatPtAPNumeric(),MatMatMultSymbolic()
811 */
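/*
   Example usage of the symbolic/numeric split (a sketch, not part of the original source;
   assumes A and P are assembled SeqAIJ matrices):

      Mat C;
      ierr = MatPtAPSymbolic(A,P,1.0,&C);CHKERRQ(ierr);   builds only the (i,j) structure of C
      ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);         fills in the numerical values of C
      ierr = MatDestroy(C);CHKERRQ(ierr);
*/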
812 #undef __FUNCT__
813 #define __FUNCT__ "MatPtAPSymbolic"
814 PetscErrorCode MatPtAPSymbolic(Mat A,Mat P,PetscReal fill,Mat *C)
815 {
816   PetscErrorCode ierr;
817   PetscErrorCode (*fA)(Mat,Mat,PetscReal,Mat*);
818   PetscErrorCode (*fP)(Mat,Mat,PetscReal,Mat*);
819 
820   PetscFunctionBegin;
821 
822   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
823   PetscValidType(A,1);
824   MatPreallocated(A);
825   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
826   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
827 
828   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
829   PetscValidType(P,2);
830   MatPreallocated(P);
831   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
832   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
833 
834   PetscValidPointer(C,3);
835 
836   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
837   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
838 
839   /* For now, we do not dispatch based on the type of A and P */
840   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
841   fA = A->ops->ptapsymbolic;
842   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for A of type %s",A->type_name);
843   fP = P->ops->ptapsymbolic;
844   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for P of type %s",P->type_name);
845   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPSymbolic requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
846 
847   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
848   ierr = (*fA)(A,P,fill,C);CHKERRQ(ierr);
849   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
850 
851   PetscFunctionReturn(0);
852 }
853 
854 typedef struct {
855   Mat    symAP;
856 } Mat_PtAPstruct;
857 
858 EXTERN PetscErrorCode MatDestroy_SeqAIJ(Mat);
859 
860 #undef __FUNCT__
861 #define __FUNCT__ "MatDestroy_SeqAIJ_PtAP"
862 PetscErrorCode MatDestroy_SeqAIJ_PtAP(Mat A)
863 {
864   PetscErrorCode    ierr;
865   Mat_PtAPstruct    *ptap=(Mat_PtAPstruct*)A->spptr;
866 
867   PetscFunctionBegin;
868   ierr = MatDestroy(ptap->symAP);CHKERRQ(ierr);
869   ierr = PetscFree(ptap);CHKERRQ(ierr);
870   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
871   PetscFunctionReturn(0);
872 }
873 
874 #undef __FUNCT__
875 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqAIJ"
876 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
877 {
878   PetscErrorCode ierr;
879   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
880   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*p = (Mat_SeqAIJ*)P->data,*c;
881   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
882   PetscInt       *ci,*cj,*ptadenserow,*ptasparserow,*ptaj;
883   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M;
884   PetscInt       i,j,k,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,nlnk,*lnk;
885   MatScalar      *ca;
886   PetscBT        lnkbt;
887 
888   PetscFunctionBegin;
889   /* Get ij structure of P^T */
890   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
891   ptJ=ptj;
892 
893   /* Allocate ci array, arrays for fill computation and */
894   /* free space for accumulating nonzero column info */
895   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
896   ci[0] = 0;
897 
898   ierr = PetscMalloc((2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
899   ierr = PetscMemzero(ptadenserow,(2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
900   ptasparserow = ptadenserow  + an;
901 
902   /* create and initialize a linked list */
903   nlnk = pn+1;
904   ierr = PetscLLCreate(pn,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
905 
906   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
907   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
908   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
909   current_space = free_space;
910 
911   /* Determine symbolic info for each row of C: */
912   for (i=0;i<pn;i++) {
913     ptnzi  = pti[i+1] - pti[i];
914     ptanzi = 0;
915     /* Determine symbolic row of PtA: */
916     for (j=0;j<ptnzi;j++) {
917       arow = *ptJ++;
918       anzj = ai[arow+1] - ai[arow];
919       ajj  = aj + ai[arow];
920       for (k=0;k<anzj;k++) {
921         if (!ptadenserow[ajj[k]]) {
922           ptadenserow[ajj[k]]    = -1;
923           ptasparserow[ptanzi++] = ajj[k];
924         }
925       }
926     }
927       /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
928     ptaj = ptasparserow;
929     cnzi   = 0;
930     for (j=0;j<ptanzi;j++) {
931       prow = *ptaj++;
932       pnzj = pi[prow+1] - pi[prow];
933       pjj  = pj + pi[prow];
934       /* add non-zero cols of P into the sorted linked list lnk */
935       ierr = PetscLLAdd(pnzj,pjj,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
936       cnzi += nlnk;
937     }
938 
939     /* If free space is not available, make more free space */
940     /* Double the amount of total space in the list */
941     if (current_space->local_remaining<cnzi) {
942       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
943     }
944 
945     /* Copy data into free space, and zero out denserows */
946     ierr = PetscLLClean(pn,pn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
947     current_space->array           += cnzi;
948     current_space->local_used      += cnzi;
949     current_space->local_remaining -= cnzi;
950 
951     for (j=0;j<ptanzi;j++) {
952       ptadenserow[ptasparserow[j]] = 0;
953     }
954     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
955     /*        For now, we will recompute what is needed. */
956     ci[i+1] = ci[i] + cnzi;
957   }
958   /* nnz is now stored in ci[pn], column indices are in the list of free space */
959   /* Allocate space for cj, initialize cj, and */
960   /* destroy list of free space and other temporary array(s) */
961   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
962   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
963   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
964   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
965 
966   /* Allocate space for ca */
967   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
968   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
969 
970   /* put together the new matrix */
971   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
972 
973   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
974   /* Since these are PETSc arrays, change flags to free them as necessary. */
975   c = (Mat_SeqAIJ *)((*C)->data);
976   c->freedata = PETSC_TRUE;
977   c->nonew    = 0;
978 
979   /* Clean up. */
980   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
981 
982   PetscFunctionReturn(0);
983 }
984 
985 #include "src/mat/impls/maij/maij.h"
986 EXTERN_C_BEGIN
987 #undef __FUNCT__
988 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqMAIJ"
989 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqMAIJ(Mat A,Mat PP,Mat *C)
990 {
991   /* This routine requires testing -- I don't think it works. */
992   PetscErrorCode ierr;
993   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
994   Mat_SeqMAIJ    *pp=(Mat_SeqMAIJ*)PP->data;
995   Mat            P=pp->AIJ;
996   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*p=(Mat_SeqAIJ*)P->data,*c;
997   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
998   PetscInt       *ci,*cj,*denserow,*sparserow,*ptadenserow,*ptasparserow,*ptaj;
999   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M,ppdof=pp->dof;
1000   PetscInt       i,j,k,dof,pdof,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi;
1001   MatScalar      *ca;
1002 
1003   PetscFunctionBegin;
1004   /* Start timer */
1005   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1006 
1007   /* Get ij structure of P^T */
1008   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1009 
1010   /* Allocate ci array, arrays for fill computation and */
1011   /* free space for accumulating nonzero column info */
1012   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1013   ci[0] = 0;
1014 
1015   ierr = PetscMalloc((2*pn+2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1016   ierr = PetscMemzero(ptadenserow,(2*pn+2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1017   ptasparserow = ptadenserow  + an;
1018   denserow     = ptasparserow + an;
1019   sparserow    = denserow     + pn;
1020 
1021   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1022   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1023   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
1024   current_space = free_space;
1025 
1026   /* Determine symbolic info for each row of C: */
1027   for (i=0;i<pn/ppdof;i++) {
1028     ptnzi  = pti[i+1] - pti[i];
1029     ptanzi = 0;
1030     ptJ    = ptj + pti[i];
1031     for (dof=0;dof<ppdof;dof++) {
1032     /* Determine symbolic row of PtA: */
1033       for (j=0;j<ptnzi;j++) {
1034         arow = ptJ[j] + dof;
1035         anzj = ai[arow+1] - ai[arow];
1036         ajj  = aj + ai[arow];
1037         for (k=0;k<anzj;k++) {
1038           if (!ptadenserow[ajj[k]]) {
1039             ptadenserow[ajj[k]]    = -1;
1040             ptasparserow[ptanzi++] = ajj[k];
1041           }
1042         }
1043       }
1044       /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1045       ptaj = ptasparserow;
1046       cnzi   = 0;
1047       for (j=0;j<ptanzi;j++) {
1048         pdof = *ptaj%ppdof;      /* use ppdof here: the loop index dof is 0 on the first pass */
1049         prow = (*ptaj++)/ppdof;
1050         pnzj = pi[prow+1] - pi[prow];
1051         pjj  = pj + pi[prow];
1052         for (k=0;k<pnzj;k++) {
1053           if (!denserow[pjj[k]+pdof]) {
1054             denserow[pjj[k]+pdof] = -1;
1055             sparserow[cnzi++]     = pjj[k]+pdof;
1056           }
1057         }
1058       }
1059 
1060       /* sort sparserow */
1061       ierr = PetscSortInt(cnzi,sparserow);CHKERRQ(ierr);
1062 
1063       /* If free space is not available, make more free space */
1064       /* Double the amount of total space in the list */
1065       if (current_space->local_remaining<cnzi) {
1066         ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1067       }
1068 
1069       /* Copy data into free space, and zero out denserows */
1070       ierr = PetscMemcpy(current_space->array,sparserow,cnzi*sizeof(PetscInt));CHKERRQ(ierr);
1071       current_space->array           += cnzi;
1072       current_space->local_used      += cnzi;
1073       current_space->local_remaining -= cnzi;
1074 
1075       for (j=0;j<ptanzi;j++) {
1076         ptadenserow[ptasparserow[j]] = 0;
1077       }
1078       for (j=0;j<cnzi;j++) {
1079         denserow[sparserow[j]] = 0;
1080       }
1081       /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1082       /*        For now, we will recompute what is needed. */
1083       ci[i+1+dof] = ci[i+dof] + cnzi;
1084     }
1085   }
1086   /* nnz is now stored in ci[pn], column indices are in the list of free space */
1087   /* Allocate space for cj, initialize cj, and */
1088   /* destroy list of free space and other temporary array(s) */
1089   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1090   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1091   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1092 
1093   /* Allocate space for ca */
1094   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1095   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1096 
1097   /* put together the new matrix */
1098   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1099 
1100   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1101   /* Since these are PETSc arrays, change flags to free them as necessary. */
1102   c = (Mat_SeqAIJ *)((*C)->data);
1103   c->freedata = PETSC_TRUE;
1104   c->nonew    = 0;
1105 
1106   /* Clean up. */
1107   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1108 
1109   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1110   PetscFunctionReturn(0);
1111 }
1112 EXTERN_C_END
1113 
1114 /*
1115    MatPtAPNumeric - Computes the matrix projection C = P^T * A * P
1116 
1117    Collective on Mat
1118 
1119    Input Parameters:
1120 +  A - the matrix
1121 -  P - the projection matrix
1122 
1123    Output Parameters:
1124 .  C - the product matrix
1125 
1126    Notes:
1127    C must have been created by calling MatPtAPSymbolic() and must be destroyed by
1128    the user using MatDestroy().
1129 
1130    This routine is currently only implemented for pairs of AIJ matrices and classes
1131    which inherit from AIJ.  C will be of type MATAIJ.
1132 
1133    Level: intermediate
1134 
1135 .seealso: MatPtAP(),MatPtAPSymbolic(),MatMatMultNumeric()
1136 */
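/*
   Example of reusing the symbolic structure (a sketch, not part of the original source;
   assumes the nonzero patterns of A and P do not change between calls):

      Mat C;
      ierr = MatPtAPSymbolic(A,P,1.0,&C);CHKERRQ(ierr);
      while ( ... the numerical values of A change ... ) {
        ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);
      }
      ierr = MatDestroy(C);CHKERRQ(ierr);
*/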
1137 #undef __FUNCT__
1138 #define __FUNCT__ "MatPtAPNumeric"
1139 PetscErrorCode MatPtAPNumeric(Mat A,Mat P,Mat C)
1140 {
1141   PetscErrorCode ierr;
1142   PetscErrorCode (*fA)(Mat,Mat,Mat);
1143   PetscErrorCode (*fP)(Mat,Mat,Mat);
1144 
1145   PetscFunctionBegin;
1146 
1147   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
1148   PetscValidType(A,1);
1149   MatPreallocated(A);
1150   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1151   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1152 
1153   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
1154   PetscValidType(P,2);
1155   MatPreallocated(P);
1156   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1157   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1158 
1159   PetscValidHeaderSpecific(C,MAT_COOKIE,3);
1160   PetscValidType(C,3);
1161   MatPreallocated(C);
1162   if (!C->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1163   if (C->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1164 
1165   if (P->N!=C->M) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->M);
1166   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
1167   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
1168   if (P->N!=C->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->N);
1169 
1170   /* For now, we do not dispatch based on the type of A and P */
1171   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
1172   fA = A->ops->ptapnumeric;
1173   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for A of type %s",A->type_name);
1174   fP = P->ops->ptapnumeric;
1175   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for P of type %s",P->type_name);
1176   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPNumeric requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
1177 
1178   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1179   ierr = (*fA)(A,P,C);CHKERRQ(ierr);
1180   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1181 
1182   PetscFunctionReturn(0);
1183 }
1184 
1185 #undef __FUNCT__
1186 #define __FUNCT__ "MatPtAPNumeric_SeqAIJ_SeqAIJ"
1187 PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ(Mat A,Mat P,Mat C)
1188 {
1189   PetscErrorCode ierr;
1190   PetscInt       flops=0;
1191   Mat_SeqAIJ     *a  = (Mat_SeqAIJ *) A->data;
1192   Mat_SeqAIJ     *p  = (Mat_SeqAIJ *) P->data;
1193   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
1194   PetscInt       *ai=a->i,*aj=a->j,*apj,*apjdense,*pi=p->i,*pj=p->j,*pJ=p->j,*pjj;
1195   PetscInt       *ci=c->i,*cj=c->j,*cjj;
1196   PetscInt       am=A->M,cn=C->N,cm=C->M;
1197   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1198   MatScalar      *aa=a->a,*apa,*pa=p->a,*pA=p->a,*paj,*ca=c->a,*caj;
1199 
1200   PetscFunctionBegin;
1201   /* Allocate temporary array for storage of one row of A*P */
1202   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1203   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1204 
1205   apj      = (PetscInt *)(apa + cn);
1206   apjdense = apj + cn;
1207 
1208   /* Clear old values in C */
1209   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1210 
1211   for (i=0;i<am;i++) {
1212     /* Form sparse row of A*P */
1213     anzi  = ai[i+1] - ai[i];
1214     apnzj = 0;
1215     for (j=0;j<anzi;j++) {
1216       prow = *aj++;
1217       pnzj = pi[prow+1] - pi[prow];
1218       pjj  = pj + pi[prow];
1219       paj  = pa + pi[prow];
1220       for (k=0;k<pnzj;k++) {
1221         if (!apjdense[pjj[k]]) {
1222           apjdense[pjj[k]] = -1;
1223           apj[apnzj++]     = pjj[k];
1224         }
1225         apa[pjj[k]] += (*aa)*paj[k];
1226       }
1227       flops += 2*pnzj;
1228       aa++;
1229     }
1230 
1231     /* Sort the j index array for quick sparse axpy. */
1232     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1233 
1234     /* Compute P^T*A*P using outer product (P^T)[:,j]*(A*P)[j,:]. */
1235     pnzi = pi[i+1] - pi[i];
1236     for (j=0;j<pnzi;j++) {
1237       nextap = 0;
1238       crow   = *pJ++;
1239       cjj    = cj + ci[crow];
1240       caj    = ca + ci[crow];
1241       /* Perform sparse axpy operation.  Note cjj includes apj. */
1242       for (k=0;nextap<apnzj;k++) {
1243         if (cjj[k]==apj[nextap]) {
1244           caj[k] += (*pA)*apa[apj[nextap++]];
1245         }
1246       }
1247       flops += 2*apnzj;
1248       pA++;
1249     }
1250 
1251     /* Zero the current row info for A*P */
1252     for (j=0;j<apnzj;j++) {
1253       apa[apj[j]]      = 0.;
1254       apjdense[apj[j]] = 0;
1255     }
1256   }
1257 
1258   /* Assemble the final matrix and clean up */
1259   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1260   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1261   ierr = PetscFree(apa);CHKERRQ(ierr);
1262   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1263 
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 /* Compute local C = P_loc^T * A * P - used by MatPtAP_MPIAIJ_MPIAIJ() */
1268 static PetscEvent logkey_matptapnumeric_local = 0;
1269 #undef __FUNCT__
1270 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_Local"
1271 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,Mat C)
1272 {
1273   PetscErrorCode ierr;
1274   PetscInt       flops=0;
1275   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
1276   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1277   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
1278   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
1279   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
1280   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
1281   PetscInt       *pJ=pj_loc,*pjj;
1282   PetscInt       *ci=c->i,*cj=c->j,*cjj;
1283   PetscInt       am=A->m,cn=C->N,cm=C->M;
1284   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1285   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*ca=c->a,*caj;
1286   MatScalar      *pa_loc=p_loc->a,*pA=pa_loc,*pa_oth=p_oth->a;
1287 
1288   PetscFunctionBegin;
1289   if (!logkey_matptapnumeric_local) {
1290     ierr = PetscLogEventRegister(&logkey_matptapnumeric_local,"MatPtAPNumeric_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
1291   }
1292   ierr = PetscLogEventBegin(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
1293 
1294   /* Allocate temporary array for storage of one row of A*P */
1295   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1296   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1297   apj      = (PetscInt *)(apa + cn);
1298   apjdense = apj + cn;
1299 
1300   /* Clear old values in C */
1301   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1302 
1303   for (i=0;i<am;i++) {
1304     /* Form i-th sparse row of A*P */
1305      apnzj = 0;
1306     /* diagonal portion of A */
1307     anzi  = adi[i+1] - adi[i];
1308     for (j=0;j<anzi;j++) {
1309       prow = *adj;
1310       adj++;
1311       pnzj = pi_loc[prow+1] - pi_loc[prow];
1312       pjj  = pj_loc + pi_loc[prow];
1313       paj  = pa_loc + pi_loc[prow];
1314       for (k=0;k<pnzj;k++) {
1315         if (!apjdense[pjj[k]]) {
1316           apjdense[pjj[k]] = -1;
1317           apj[apnzj++]     = pjj[k];
1318         }
1319         apa[pjj[k]] += (*ada)*paj[k];
1320       }
1321       flops += 2*pnzj;
1322       ada++;
1323     }
1324     /* off-diagonal portion of A */
1325     anzi  = aoi[i+1] - aoi[i];
1326     for (j=0;j<anzi;j++) {
1327       col = a->garray[*aoj];
1328       if (col < cstart){
1329         prow = *aoj;
1330       } else if (col >= cend){
1331         prow = *aoj;
1332       } else {
1333         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
1334       }
1335       aoj++;
1336       pnzj = pi_oth[prow+1] - pi_oth[prow];
1337       pjj  = pj_oth + pi_oth[prow];
1338       paj  = pa_oth + pi_oth[prow];
1339       for (k=0;k<pnzj;k++) {
1340         if (!apjdense[pjj[k]]) {
1341           apjdense[pjj[k]] = -1;
1342           apj[apnzj++]     = pjj[k];
1343         }
1344         apa[pjj[k]] += (*aoa)*paj[k];
1345       }
1346       flops += 2*pnzj;
1347       aoa++;
1348     }
1349     /* Sort the j index array for quick sparse axpy. */
1350     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1351 
1352     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
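    /*
       For each nonzero P_loc(i,crow) the update below is
           C(crow,apj[t]) += P_loc(i,crow) * AP(i,apj[t]),  t = 0,...,apnzj-1,
       merged against the symbolic pattern cjj of row crow, which is sorted and
       contains apj as a subset.
    */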
1353     pnzi = pi_loc[i+1] - pi_loc[i];
1354     for (j=0;j<pnzi;j++) {
1355       nextap = 0;
1356       crow   = *pJ++;
1357       cjj    = cj + ci[crow];
1358       caj    = ca + ci[crow];
1359       /* Perform sparse axpy operation.  Note cjj includes apj. */
1360       for (k=0;nextap<apnzj;k++) {
1361         if (cjj[k]==apj[nextap]) {
1362           caj[k] += (*pA)*apa[apj[nextap++]];
1363         }
1364       }
1365       flops += 2*apnzj;
1366       pA++;
1367     }
1368 
1369     /* Zero the current row info for A*P */
1370     for (j=0;j<apnzj;j++) {
1371       apa[apj[j]]      = 0.;
1372       apjdense[apj[j]] = 0;
1373     }
1374   }
1375 
1376   /* Assemble the final matrix and clean up */
1377   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1378   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1379   ierr = PetscFree(apa);CHKERRQ(ierr);
1380   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1381   ierr = PetscLogEventEnd(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
1382   PetscFunctionReturn(0);
1383 }
1384 static PetscEvent logkey_matptapsymbolic_local = 0;
1385 #undef __FUNCT__
1386 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local"
1387 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,PetscReal fill,Mat *C)
1388 {
1389   PetscErrorCode ierr;
1390   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1391   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
1392   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
1393   PetscInt       *pti,*ptj,*ptJ,*ajj,*pjj;
1394   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
1395   PetscInt       *ci,*cj,*ptaj;
1396   PetscInt       aN=A->N,am=A->m,pN=P_loc->N;
1397   PetscInt       i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
1398   PetscInt       pm=P_loc->m,nlnk,*lnk;
1399   MatScalar      *ca;
1400   PetscBT        lnkbt;
1401   PetscInt       prend,nprow_loc,nprow_oth;
1402   PetscInt       *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
1403   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
1404   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
1405 
1406   PetscFunctionBegin;
1407   if (!logkey_matptapsymbolic_local) {
1408     ierr = PetscLogEventRegister(&logkey_matptapsymbolic_local,"MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
1409   }
1410   ierr = PetscLogEventBegin(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
1411 
1412   prend = prstart + pm;
1413 
1414   /* get ij structure of P_loc^T */
1415   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1416   ptJ=ptj;
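  /*
     pti/ptj hold the compressed-row structure of P_loc^T, so row i of C
     corresponds to column i of P_loc; ptJ lists the rows of P_loc with a
     nonzero in that column, and those indices are used as local row indices
     of A below.
  */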
1417 
1418   /* Allocate ci array, arrays for fill computation and */
1419   /* free space for accumulating nonzero column info */
1420   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1421   ci[0] = 0;
1422 
1423   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
1424   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
1425   ptasparserow_loc = ptadenserow_loc + aN;
1426   ptadenserow_oth  = ptasparserow_loc + aN;
1427   ptasparserow_oth = ptadenserow_oth + aN;
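  /*
     The single allocation of length 4*aN is split into two dense-flag/sparse-index
     pairs: ptadenserow_loc/ptasparserow_loc record which rows of P_loc are reached
     through the diagonal block of A while forming row i of C, and
     ptadenserow_oth/ptasparserow_oth record which rows of P_oth are reached through
     the off-diagonal block.  The dense arrays flag membership, the sparse arrays
     list the entries for fast reset.
  */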
1428 
1429   /* create and initialize a linked list */
1430   nlnk = pN+1;
1431   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
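  /* lnk is a sorted linked list (with bit table lnkbt) that accumulates the
     union of the column indices contributed to the current row of C */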
1432 
1433   /* Set initial free space to be fill*nnz(A)*pN/aN, i.e. nnz(A) scaled by fill*P->N/P->M. */
1434   /* This should be reasonable if the sparsity of PtAP is similar to that of A. */
1435   nnz           = adi[am] + aoi[am];
1436   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
1437   current_space = free_space;
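  /* For example (an illustrative calculation with made-up sizes): with fill=1.0,
     nnz(A)=1000, pN=100 and aN=400, the initial guess is 1000*100/400+1 = 251 entries */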
1438 
1439   /* determine symbolic info for each row of C: */
1440   for (i=0;i<pN;i++) {
1441     ptnzi  = pti[i+1] - pti[i];
1442     nprow_loc = 0; nprow_oth = 0;
1443     /* i-th row of symbolic P_loc^T*A_loc: */
1444     for (j=0;j<ptnzi;j++) {
1445       arow = *ptJ++;
1446       /* diagonal portion of A */
1447       anzj = adi[arow+1] - adi[arow];
1448       ajj  = adj + adi[arow];
1449       for (k=0;k<anzj;k++) {
1450         col = ajj[k]+prstart;
1451         if (!ptadenserow_loc[col]) {
1452           ptadenserow_loc[col]    = -1;
1453           ptasparserow_loc[nprow_loc++] = col;
1454         }
1455       }
1456       /* off-diagonal portion of A */
1457       anzj = aoi[arow+1] - aoi[arow];
1458       ajj  = aoj + aoi[arow];
1459       for (k=0;k<anzj;k++) {
1460         col = a->garray[ajj[k]];  /* global col */
1461         if (col < cstart){
1462           col = ajj[k];
1463         } else if (col >= cend){
1464           col = ajj[k] + pm;
1465         } else {
1466           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
1467         }
1468         if (!ptadenserow_oth[col]) {
1469           ptadenserow_oth[col]    = -1;
1470           ptasparserow_oth[nprow_oth++] = col;
1471         }
1472       }
1473     }
1474 
1475     /* using symbolic info of local PtA, determine symbolic info for row of C: */
1476     cnzi   = 0;
1477     /* rows of P_loc */
1478     ptaj = ptasparserow_loc;
1479     for (j=0; j<nprow_loc; j++) {
1480       prow = *ptaj++;
1481       prow -= prstart; /* rm */
1482       pnzj = pi_loc[prow+1] - pi_loc[prow];
1483       pjj  = pj_loc + pi_loc[prow];
1484       /* add non-zero cols of P into the sorted linked list lnk */
1485       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1486       cnzi += nlnk;
1487     }
1488     /* rows of P_oth */
1489     ptaj = ptasparserow_oth;
1490     for (j=0; j<nprow_oth; j++) {
1491       prow = *ptaj++;
1492       if (prow >= prend) prow -= pm; /* rm */
1493       pnzj = pi_oth[prow+1] - pi_oth[prow];
1494       pjj  = pj_oth + pi_oth[prow];
1495       /* add non-zero cols of P into the sorted linked list lnk */
1496       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1497       cnzi += nlnk;
1498     }
1499 
1500     /* If free space is not available, make more free space */
1501     /* Double the amount of total space in the list */
1502     if (current_space->local_remaining<cnzi) {
1503       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1504     }
1505 
1506     /* Copy data into free space, and zero out denserows */
1507     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1508     current_space->array           += cnzi;
1509     current_space->local_used      += cnzi;
1510     current_space->local_remaining -= cnzi;
1511 
1512     for (j=0;j<nprow_loc; j++) {
1513       ptadenserow_loc[ptasparserow_loc[j]] = 0;
1514     }
1515     for (j=0;j<nprow_oth; j++) {
1516       ptadenserow_oth[ptasparserow_oth[j]] = 0;
1517     }
1518 
1519     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1520     /*        For now, we will recompute what is needed. */
1521     ci[i+1] = ci[i] + cnzi;
1522   }
1523   /* nnz is now stored in ci[pN], column indices are in the list of free space */
1524   /* Allocate space for cj, initialize cj, and */
1525   /* destroy list of free space and other temporary array(s) */
1526   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1527   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1528   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
1529   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1530 
1531   /* Allocate space for ca */
1532   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1533   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
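  /* The values in ca are left zero here; they are filled in later by
     MatPtAPNumeric_MPIAIJ_MPIAIJ_Local() */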
1534 
1535   /* put together the new matrix */
1536   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,pN,pN,ci,cj,ca,C);CHKERRQ(ierr);
1537 
1538   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1539   /* Since these are PETSc arrays, change flags to free them as necessary. */
1540   c = (Mat_SeqAIJ *)((*C)->data);
1541   c->freedata = PETSC_TRUE;
1542   c->nonew    = 0;
1543 
1544   /* Clean up. */
1545   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1546 
1547   ierr = PetscLogEventEnd(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
1548   PetscFunctionReturn(0);
1549 }
1550