/*
  Defines projective product routines where A is an AIJ matrix
          C = P^T * A * P
*/

#include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
#include "src/mat/utils/freespace.h"
#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

#undef __FUNCT__
#define __FUNCT__ "MatPtAP"
/*@
   MatPtAP - Creates the matrix projection C = P^T * A * P

   Collective on Mat

   Input Parameters:
+  A - the matrix
.  P - the projection matrix
.  scall - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
-  fill - expected fill as ratio of nnz(C)/(nnz(A) + nnz(P))

   Output Parameters:
.  C - the product matrix

   Notes:
   C will be created and must be destroyed by the user with MatDestroy().

   This routine is currently only implemented for pairs of SeqAIJ matrices and classes
   which inherit from SeqAIJ.  C will be of type MATSEQAIJ.
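
   Example usage (a sketch for illustration only; Afine, Prolong, and Acoarse
   are hypothetical user matrices):
.vb
      ierr = MatPtAP(Afine,Prolong,MAT_INITIAL_MATRIX,1.0,&Acoarse);CHKERRQ(ierr);
      ...   change the numerical values of Afine, keeping its nonzero pattern
      ierr = MatPtAP(Afine,Prolong,MAT_REUSE_MATRIX,1.0,&Acoarse);CHKERRQ(ierr);
      ierr = MatDestroy(Acoarse);CHKERRQ(ierr);
.ve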

   Level: intermediate

.seealso: MatPtAPSymbolic(),MatPtAPNumeric(),MatMatMult()
@*/
PetscErrorCode MatPtAP(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscErrorCode (*fA)(Mat,Mat,MatReuse,PetscReal,Mat *);
  PetscErrorCode (*fP)(Mat,Mat,MatReuse,PetscReal,Mat *);

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_COOKIE,1);
  PetscValidType(A,1);
  MatPreallocated(A);
  if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  PetscValidHeaderSpecific(P,MAT_COOKIE,2);
  PetscValidType(P,2);
  MatPreallocated(P);
  if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  PetscValidPointer(C,5);

  if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);

  if (fill <= 0.0) SETERRQ1(PETSC_ERR_ARG_SIZ,"fill=%g must be > 0.0",fill);

  /* For now, we do not dispatch based on the type of A and P */
  /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
  fA = A->ops->ptap;
  if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for A of type %s",A->type_name);
  fP = P->ops->ptap;
  if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for P of type %s",P->type_name);
  if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAP requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);

  ierr = PetscLogEventBegin(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
  ierr = (*fA)(A,P,scall,fill,C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,PetscReal,Mat*);
EXTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,Mat);

EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);

    ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
    /* free ptap only when the container supplied it; otherwise it is uninitialized */
    ierr = PetscFree(ptap);CHKERRQ(ierr);
  }

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
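
/* Outline of the container pattern used above and below (a sketch; "ctx",
   "mat", and the compose name are placeholders):

     PetscObjectContainer cont;
     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&cont);CHKERRQ(ierr);
     ierr = PetscObjectContainerSetPointer(cont,ctx);CHKERRQ(ierr);
     ierr = PetscObjectCompose((PetscObject)mat,"Name",(PetscObject)cont);CHKERRQ(ierr);

   and later, to recover ctx:

     ierr = PetscObjectQuery((PetscObject)mat,"Name",(PetscObject *)&cont);CHKERRQ(ierr);
     if (cont) {
       ierr = PetscObjectContainerGetPointer(cont,(void **)&ctx);CHKERRQ(ierr);
     }
*/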

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);

    /* attach the supporting struct to P for reuse */
    P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
    ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
    ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);

    /* now, compute the symbolic local P^T*A*P */
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  } else if (scall == MAT_REUSE_MATRIX){
    ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
    if (container) {
      ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    } else {
      SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
    }
    /* update P_oth */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

  } else {
    SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
  }
  /* now, compute the numeric local P^T*A*P */
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
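
/* The symbolic kernels below use a standard sparse-accumulator idiom: a dense
   work array marks which columns have already been seen while a sparse list
   records them in order, so duplicates are skipped and the marks can be
   cleared in O(row nnz) instead of O(N).  A minimal sketch of the idiom:

     for each nonzero column col contributed to the current row {
       if (!dense[col]) { dense[col] = -1; sparse[nz++] = col; }
     }
     ... consume sparse[0..nz-1] ...
     for (j=0; j<nz; j++) dense[sparse[j]] = 0;
*/
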
/* Set symbolic info for the i-th row of the local product C = P^T*A*P */
#define MatPtAPSymbolicLocal_Private(ptM,pti,ptj,ci,cj) 0;\
{\
  /* allocate ci array, arrays for fill computation and */\
  /* free space for accumulating nonzero column info */\
  ierr = PetscMalloc((ptM+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);\
  ci[0] = 0;\
\
  /* set the initial free space to be nnz(A) scaled by fill*pN/ptM. */\
  /* this should be reasonable if the sparsity of PtAP is similar to that of A. */\
  nnz           = adi[am] + aoi[am];\
  ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/ptM+1),&free_space);CHKERRQ(ierr);\
  current_space = free_space;\
\
  ptJ=ptj;\
  for (i=0; i<ptM; i++) {\
    ptnzi  = pti[i+1] - pti[i];\
    nprow_loc = 0; nprow_oth = 0;\
    /* i-th row of symbolic Pt*A: */\
    for (j=0;j<ptnzi;j++) {\
      arow = *ptJ++;\
      /* diagonal portion of A */\
      anzj = adi[arow+1] - adi[arow];\
      ajj  = adj + adi[arow];\
      for (k=0;k<anzj;k++) {\
        col = ajj[k]+prstart;\
        if (!ptadenserow_loc[col]) {\
          ptadenserow_loc[col]    = -1;\
          ptasparserow_loc[nprow_loc++] = col;\
        }\
      }\
      /* off-diagonal portion of A */\
      anzj = aoi[arow+1] - aoi[arow];\
      ajj  = aoj + aoi[arow];\
      for (k=0;k<anzj;k++) {\
        col = a->garray[ajj[k]];  /* global col */\
        if (col < cstart){\
          col = ajj[k];\
        } else if (col >= cend){\
          col = ajj[k] + pm;\
        } else {\
          SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");\
        }\
        if (!ptadenserow_oth[col]) {\
          ptadenserow_oth[col]    = -1;\
          ptasparserow_oth[nprow_oth++] = col;\
        }\
      }\
    }\
    /* determine the symbolic info for this row of C_seq: */\
    cnzi = 0;\
    /* rows of P_loc */\
    ptaj = ptasparserow_loc;\
    for (j=0; j<nprow_loc; j++) {\
      prow = *ptaj++;\
      prow -= prstart; /* rm */\
      pnzj = pi_loc[prow+1] - pi_loc[prow];\
      pjj  = pj_loc + pi_loc[prow];\
      /* add the non-zero cols of P into the sorted linked list lnk */\
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
      cnzi += nlnk;\
    }\
    /* rows of P_oth */\
    ptaj = ptasparserow_oth;\
    for (j=0; j<nprow_oth; j++) {\
      prow = *ptaj++;\
      if (prow >= prend) prow -= pm; /* rm */\
      pnzj = pi_oth[prow+1] - pi_oth[prow];\
      pjj  = pj_oth + pi_oth[prow];\
      /* add the non-zero cols of P into the sorted linked list lnk */\
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
      cnzi += nlnk;\
    }\
\
    /* if free space is not available, make more free space */\
    /* by doubling the total amount of space in the list */\
    if (current_space->local_remaining<cnzi) {\
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);\
    }\
\
    /* copy data into the free space, and zero out the dense rows */\
    ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);\
    current_space->array           += cnzi;\
    current_space->local_used      += cnzi;\
    current_space->local_remaining -= cnzi;\
\
    for (j=0;j<nprow_loc; j++) {\
      ptadenserow_loc[ptasparserow_loc[j]] = 0;\
    }\
    for (j=0;j<nprow_oth; j++) {\
      ptadenserow_oth[ptasparserow_oth[j]] = 0;\
    }\
\
    /* Aside: Perhaps we should save the pta info for the numerical factorization. */\
    /*        For now, we will recompute what is needed. */\
    ci[i+1] = ci[i] + cnzi;\
  }\
  /* nnz is now stored in ci[ptM], column indices are in the list of free space */\
  /* allocate space for cj, initialize cj, and */\
  /* destroy the list of free space and other temporary array(s) */\
  ierr = PetscMalloc((ci[ptM]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);\
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);\
}
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat                  P_loc,P_oth;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;
  FreeSpaceList        free_space=PETSC_NULL,current_space;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,
                       *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_SeqAIJ           *p_loc,*p_oth;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj,*rmap=p->garray;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
  PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
  PetscInt             aN=A->N,am=A->m,pN=P->N;
  PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi,row;
  PetscBT              lnkbt;
  PetscInt             prstart,prend,nprow_loc,nprow_oth;
  PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;

  MPI_Comm             comm=A->comm;
  Mat                  B_mpi;
  PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
  PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt             len,proc,*dnz,*onz,*owners;
  PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
  PetscInt             nrows,tnrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
  MPI_Status           *status;
  Mat_Merge_SeqsToMPI  *merge;
  PetscInt             *ptdi,*ptdj,*cdi,*cdj;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }

  /* determine the row ownership of C */
  /*---------------------------------------------------------*/
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);

  ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
  ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* get data from P_loc and P_oth */
  P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
  p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
  prend = prstart + pm;

  /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
  ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow_loc = ptadenserow_loc + aN;
  ptadenserow_oth  = ptasparserow_loc + aN;
  ptasparserow_oth = ptadenserow_oth + aN;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* Pt = P_loc^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
  ierr = MatPtAPSymbolicLocal_Private(pN,pti,ptj,ci,cj);CHKERRQ(ierr);

  /* Pt = Pd^T -- only the diagonal portion of P_loc */
  ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);
  ierr = MatPtAPSymbolicLocal_Private(pn,ptdi,ptdj,cdi,cdj);CHKERRQ(ierr);
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);

  /* clean up */
  ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);

  /* determine the number of messages to send and their lengths */
  /*---------------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  len = 0;  /* max length of buf_si[] */
  merge->nsend = 0;
  tnrows = (p->B)->N; /* total number of rows to be sent to other processors */
  proc = 0;
  for (i=0; i<tnrows; i++){
    while (rmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;
  }
  for (proc=0; proc<size; proc++){
    len_s[proc] = 0;
    if (len_si[proc]){
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* number of col indices to be sent to [proc] */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for the ij-structure */
  /*-------------------------------------------------------------------------*/
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv of the j-structure */
  /*-------------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);

  /* post the Isend of the j-structure */
  /*--------------------------------*/
  ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
  sj_waits = si_waits + merge->nsend;

  for (proc=0, k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of the j-structure are complete */
  /*------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);

  /* send and recv the i-structure */
  /*---------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of the k-th msg to be sent */

  j = 0; /* row index to be sent */
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    /* form the outgoing message for the i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (local to the receiving process)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    for (i=0; i<nrows; i++){
      row = rmap[j++];
      anzi = ci[row+1] - ci[row];
      buf_si_i[i+1] = buf_si_i[i] + anzi; /* i-structure */
      buf_si[i+1]   = row - owners[proc]; /* local row index */
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
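
  /* Illustration (made-up values): if this process sends two rows whose local
     indices on the receiver are 3 and 7, with 2 and 5 nonzeros respectively,
     then nrows = 2 and the message is
         buf_si = [2, 3, 7, 0, 2, 7]
     i.e. the row count, the local row indices, then the running i-structure. */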

  ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);

  ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++){
    ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(rj_waits);CHKERRQ(ierr);
  ierr = PetscFree(si_waits);CHKERRQ(ierr);
  ierr = PetscFree(ri_waits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  /* compute a local seq matrix on each processor */
  /*----------------------------------------------*/
  /* allocate the bi array and free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* the initial FreeSpace size is 2*(number of local nnz(C_seq)) */
  len = ci[owners[rank+1]] - ci[owners[rank]];
  free_space=PETSC_NULL;
  ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine the symbolic info for each local row of C */
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow = buf_ri_k + merge->nrecv;
  nextci  = nextrow + merge->nrecv;
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of the k-th received i-structure */
    nrows = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of the k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of the k-th received i-structure */
  }
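
  /* Each received i-structure is consumed with two cursors: *nextrow[k] is the
     next local row contributed by message k and *nextci[k] its offset into the
     received j-structure, so the loop below merges the contributions of all
     messages row by row. */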

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  len = 0;
  for (i=0; i<pn; i++) {
    bnzi = 0;
    /* add local non-zero cols of this proc's C_seq into lnk */
    anzi = cdi[i+1] - cdi[i];
    cji  = cdj + cdi[i];
    ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
    bnzi += nlnk;
    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        anzi = *(nextci[k]+1) - *nextci[k];
        cji  = buf_rj[k] + *nextci[k];
        ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
        bnzi += nlnk;
        nextrow[k]++; nextci[k]++;
      }
    }
    if (len < bnzi) len = bnzi;  /* = max(bnzi) */

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<bnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into the free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += bnzi;
    current_space->local_used      += bnzi;
    current_space->local_remaining -= bnzi;

    bi[i+1] = bi[i] + bnzi;
  }

  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* create the symbolic parallel matrix B_mpi */
  /*---------------------------------------*/
  ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
  ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);

  /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
  B_mpi->assembled     = PETSC_FALSE;
  B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
  merge->bi            = bi;
  merge->bj            = bj;
  merge->ci            = ci;
  merge->cj            = cj;
  merge->buf_ri        = buf_ri;
  merge->buf_rj        = buf_rj;
  merge->C_seq         = PETSC_NULL;

  /* attach the supporting struct to B_mpi for reuse */
  ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
  ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
  *C = B_mpi;

  ierr = PetscFree(cdi);CHKERRQ(ierr);
  ierr = PetscFree(cdj);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode       ierr;
  Mat_Merge_SeqsToMPI  *merge;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer cont_merge,cont_ptap;
  PetscInt             flops=0;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data,*p_oth;
  Mat_SeqAIJ           *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense;
  PetscInt             *pi_oth,*pj_oth,*pJ_d=pd->j,*pJ_o=po->j,*pjj;
  PetscInt             i,j,k,nzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  MatScalar            *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj,**abuf_r,*ba_i,*ca;
  MatScalar            *pA_d=pd->a,*pA_o=po->a,*pa_oth;
  PetscInt             am=A->m,cN=C->N,cm=C->m;
  MPI_Comm             comm=C->comm;
  PetscMPIInt          size,rank,taga,*len_s,**buf_ri,**buf_rj,**buf_ri_k,**nextrow,**nextcseqi;
  PetscInt             *owners,proc,nrows;
  PetscInt             *cjj,cseqnzi,*bj_i,*bi,*bj,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
  MPI_Request          *s_waits,*r_waits;
  MPI_Status           *status;
  PetscInt             *cseqi,*cseqj,col;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
  if (cont_merge) {
    ierr = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix C does not possess an object container");
  }
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
  if (cont_ptap) {
    ierr = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }
  /* get data from the symbolic products */
  p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  cseqi = merge->ci; cseqj=merge->cj;
  ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
  ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* allocate a temporary array for storage of one row of A*P and one row of C */
  ierr = PetscMalloc(2*cN*(sizeof(MatScalar)+sizeof(PetscInt)),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,2*cN*(sizeof(MatScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
  ca       = (MatScalar*)(apa + cN);
  apj      = (PetscInt *)(ca + cN);
  apjdense = (PetscInt *)(apj + cN);
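
  /* Layout of the single workspace allocation (cN elements each):
       apa[0..cN-1]       dense accumulator for one row of A*P
       ca[0..cN-1]        staging for one row of C values
       apj[0..cN-1]       sparse column indices of the current row
       apjdense[0..cN-1]  nonzero markers for the accumulator       */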

  /* clear the old values in C */
  ierr = PetscMemzero(cd->a,cd->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
  ierr = PetscMemzero(co->a,co->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0;i<am;i++) {
    /* form the i-th sparse row of A*P */
    apnzj = 0;
    /* diagonal portion of A */
    nzi  = adi[i+1] - adi[i];
    for (j=0;j<nzi;j++) {
      prow = *adj++;
      /* diagonal portion of P */
      pnzj = pd->i[prow+1] - pd->i[prow];
      pjj  = pd->j + pd->i[prow]; /* local col index of P */
      paj  = pd->a + pd->i[prow];
      for (k=0;k<pnzj;k++) {
        col = *pjj + p->cstart; pjj++; /* global col index of P */
        if (!apjdense[col]) {
          apjdense[col] = -1;
          apj[apnzj++]  = col;
        }
        apa[col] += (*ada)*paj[k];
      }
      flops += 2*pnzj;
      /* off-diagonal portion of P */
      pnzj = po->i[prow+1] - po->i[prow];
      pjj  = po->j + po->i[prow]; /* local col index of P */
      paj  = po->a + po->i[prow];
      for (k=0;k<pnzj;k++) {
        col = p->garray[*pjj]; pjj++; /* global col index of P */
        if (!apjdense[col]) {
          apjdense[col] = -1;
          apj[apnzj++]  = col;
        }
        apa[col] += (*ada)*paj[k];
      }
      flops += 2*pnzj;

      ada++;
    }
    /* off-diagonal portion of A */
    nzi  = aoi[i+1] - aoi[i];
    for (j=0;j<nzi;j++) {
      prow = *aoj++;
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      paj  = pa_oth + pi_oth[prow];
      for (k=0;k<pnzj;k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*aoa)*paj[k];
      }
      flops += 2*pnzj;
      aoa++;
    }
    /* sort the j index array for a quick sparse axpy */
    ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);

    /* compute P_loc[i,:]^T*AP[i,:] using an outer product */
    /* diagonal portion of P -- gives the matrix values of local C */
    pnzi = pd->i[i+1] - pd->i[i];
    for (j=0;j<pnzi;j++) {
      /* add the values into C */
      for (k=0; k<apnzj; k++) ca[k] = (*pA_d)*apa[apj[k]];
      crow = (*pJ_d++) + owners[rank];
      ierr = MatSetValues(C,1,&crow,apnzj,apj,ca,ADD_VALUES);CHKERRQ(ierr);
      ierr = PetscMemzero(ca,apnzj*(sizeof(MatScalar)));CHKERRQ(ierr);
      pA_d++;
      flops += 2*apnzj;
    }

    /* off-diagonal portion of P -- gives the matrix values to be sent to other processors */
    pnzi = po->i[i+1] - po->i[i];
    for (j=0;j<pnzi;j++) {
      crow   = p->garray[*pJ_o++];
      cjj    = cseqj + cseqi[crow];
      caj    = cseqa + cseqi[crow];
      /* add the values into C_seq to be sent to other processors */
      nextap = 0;
      for (k=0;nextap<apnzj;k++) {
        if (cjj[k]==apj[nextap]) {
          caj[k] += (*pA_o)*apa[apj[nextap++]];
        }
      }
      flops += 2*apnzj;
      pA_o++;
    }

    /* zero the current row info for A*P */
    for (j=0;j<apnzj;j++) {
      apa[apj[j]]      = 0.;
      apjdense[apj[j]] = 0;
    }
  }

  /* send and recv the matrix values */
  /*-----------------------------*/
  bi     = merge->bi;
  bj     = merge->bj;
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
  ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  ierr = PetscFree(s_waits);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(cseqa);CHKERRQ(ierr);

  /* insert the matrix values of the mpi matrix */
  /*----------------------------*/
  ba_i = apa; /* rename; a temporary array for storage of one row of C */
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow = buf_ri_k + merge->nrecv;
  nextcseqi = nextrow + merge->nrecv;

  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of the k-th received i-structure */
    nrows        = *(buf_ri_k[k]);
    nextrow[k]   = buf_ri_k[k]+1;  /* next row index of the k-th received i-structure */
    nextcseqi[k] = buf_ri_k[k] + (nrows + 1);/* points to the next i-structure of the k-th received i-structure */
  }

  /* add the received local values to C */
  for (i=0; i<cm; i++) {
    crow = owners[rank] + i;
    bj_i = bj+bi[i];  /* col indices of the i-th row of C */
    bnzi = bi[i+1] - bi[i];
    nzi = 0;
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k];
        cseqj   = buf_rj[k] + *(nextcseqi[k]);
        cseqa   = abuf_r[k] + *(nextcseqi[k]);
        nextcseqj = 0;
        for (j=0; nextcseqj<cseqnzi; j++){
          if (*(bj_i + j) == cseqj[nextcseqj]){ /* bcol == cseqcol */
            ba_i[j] += cseqa[nextcseqj++];
            nzi++;
          }
        }
        nextrow[k]++; nextcseqi[k]++;
      }
    }
    if (nzi>0){
      ierr = MatSetValues(C,1,&crow,bnzi,bj_i,ba_i,ADD_VALUES);CHKERRQ(ierr);
      ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
      flops += 2*nzi;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
  ierr = PetscFree(apa);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_SeqAIJ_SeqAIJ"
PetscErrorCode MatPtAP_SeqAIJ_SeqAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    ierr = MatPtAPSymbolic_SeqAIJ_SeqAIJ(A,P,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatPtAPSymbolic - Creates the (i,j) structure of the matrix projection C = P^T * A * P

   Collective on Mat

   Input Parameters:
+  A - the matrix
.  P - the projection matrix
-  fill - expected fill as ratio of nnz(C)/(nnz(A) + nnz(P))

   Output Parameters:
.  C - the (i,j) structure of the product matrix

   Notes:
   C will be created and must be destroyed by the user with MatDestroy().

   This routine is currently only implemented for pairs of SeqAIJ matrices and classes
   which inherit from SeqAIJ.  C will be of type MATSEQAIJ.  The product is computed using
   this (i,j) structure by calling MatPtAPNumeric().

   Level: intermediate

.seealso: MatPtAP(),MatPtAPNumeric(),MatMatMultSymbolic()
*/
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic"
PetscErrorCode MatPtAPSymbolic(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscErrorCode (*fA)(Mat,Mat,PetscReal,Mat*);
  PetscErrorCode (*fP)(Mat,Mat,PetscReal,Mat*);

  PetscFunctionBegin;

  PetscValidHeaderSpecific(A,MAT_COOKIE,1);
  PetscValidType(A,1);
  MatPreallocated(A);
  if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidHeaderSpecific(P,MAT_COOKIE,2);
  PetscValidType(P,2);
  MatPreallocated(P);
  if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidPointer(C,4);

  if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
  if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);

  /* For now, we do not dispatch based on the type of A and P */
  /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
  fA = A->ops->ptapsymbolic;
  if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for A of type %s",A->type_name);
  fP = P->ops->ptapsymbolic;
  if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for P of type %s",P->type_name);
  if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPSymbolic requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);

  ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  ierr = (*fA)(A,P,fill,C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

typedef struct {
  Mat symAP;
} Mat_PtAPstruct;

EXTERN PetscErrorCode MatDestroy_SeqAIJ(Mat);

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJ_PtAP"
PetscErrorCode MatDestroy_SeqAIJ_PtAP(Mat A)
{
  PetscErrorCode ierr;
  Mat_PtAPstruct *ptap=(Mat_PtAPstruct*)A->spptr;

  PetscFunctionBegin;
  ierr = MatDestroy(ptap->symAP);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqAIJ"
PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*p = (Mat_SeqAIJ*)P->data,*c;
  PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
  PetscInt       *ci,*cj,*ptadenserow,*ptasparserow,*ptaj;
  PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M;
  PetscInt       i,j,k,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,nlnk,*lnk;
  MatScalar      *ca;
  PetscBT        lnkbt;

  PetscFunctionBegin;
  /* Get the ij structure of P^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
  ptJ = ptj;

  /* Allocate the ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow,(2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow = ptadenserow + an;

  /* create and initialize a linked list */
  nlnk = pn+1;
  ierr = PetscLLCreate(pn,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* Set the initial free space to be nnz(A) scaled by the aspect ratio of P. */
  /* This should be reasonable if the sparsity of PtAP is similar to that of A. */
  ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
  current_space = free_space;
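
  /* free_space is a chained list of buffers: whenever a row does not fit,
     GetMoreSpace() appends a buffer as large as everything allocated so far
     (doubling the total), and MakeSpaceContiguous() later flattens the chain
     into the final cj array. */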

  /* Determine symbolic info for each row of C: */
  for (i=0;i<pn;i++) {
    ptnzi  = pti[i+1] - pti[i];
    ptanzi = 0;
    /* Determine the symbolic row of PtA: */
    for (j=0;j<ptnzi;j++) {
      arow = *ptJ++;
      anzj = ai[arow+1] - ai[arow];
      ajj  = aj + ai[arow];
      for (k=0;k<anzj;k++) {
        if (!ptadenserow[ajj[k]]) {
          ptadenserow[ajj[k]]    = -1;
          ptasparserow[ptanzi++] = ajj[k];
        }
      }
    }
    /* Using the symbolic info for the row of PtA, determine the symbolic info for the row of C: */
    ptaj = ptasparserow;
    cnzi = 0;
    for (j=0;j<ptanzi;j++) {
      prow = *ptaj++;
      pnzj = pi[prow+1] - pi[prow];
      pjj  = pj + pi[prow];
      /* add the non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }

    /* If free space is not available, make more free space */
    /* by doubling the total amount of space in the list */
    if (current_space->local_remaining<cnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into the free space, and zero out the dense rows */
    ierr = PetscLLClean(pn,pn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += cnzi;
    current_space->local_used      += cnzi;
    current_space->local_remaining -= cnzi;

    for (j=0;j<ptanzi;j++) {
      ptadenserow[ptasparserow[j]] = 0;
    }
    /* Aside: Perhaps we should save the pta info for the numerical factorization. */
    /*        For now, we will recompute what is needed. */
    ci[i+1] = ci[i] + cnzi;
  }
  /* nnz is now stored in ci[pn], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy the list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Allocate space for ca */
  ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
  ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);

  /* put together the new matrix */
  ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);

  /* MatCreateSeqAIJWithArrays flags the matrix so PETSc doesn't free the user's arrays. */
  /* Since these are PETSc arrays, change the flags to free them as necessary. */
  c = (Mat_SeqAIJ *)((*C)->data);
  c->freedata = PETSC_TRUE;
  c->nonew    = 0;

  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#include "src/mat/impls/maij/maij.h"
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqMAIJ"
PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqMAIJ(Mat A,Mat PP,Mat *C)
{
  /* This routine requires testing -- I don't think it works. */
  PetscErrorCode ierr;
  FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_SeqMAIJ    *pp=(Mat_SeqMAIJ*)PP->data;
  Mat            P=pp->AIJ;
  Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*p=(Mat_SeqAIJ*)P->data,*c;
  PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
  PetscInt       *ci,*cj,*denserow,*sparserow,*ptadenserow,*ptasparserow,*ptaj;
  PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M,ppdof=pp->dof;
  PetscInt       i,j,k,dof,pdof,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi;
  MatScalar      *ca;

  PetscFunctionBegin;
  /* Start the timer */
  ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);

  /* Get the ij structure of P^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);

  /* Allocate the ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((2*pn+2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow,(2*pn+2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow = ptadenserow  + an;
  denserow     = ptasparserow + an;
  sparserow    = denserow     + pn;

  /* Set the initial free space to be nnz(A) scaled by the aspect ratio of P. */
  /* This should be reasonable if the sparsity of PtAP is similar to that of A. */
  ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
  current_space = free_space;
  /* Determine symbolic info for each row of C: */
  for (i=0;i<pn/ppdof;i++) {
    ptnzi  = pti[i+1] - pti[i];
    ptanzi = 0;
    ptJ    = ptj + pti[i];
    for (dof=0;dof<ppdof;dof++) {
      /* Determine the symbolic row of PtA: */
      for (j=0;j<ptnzi;j++) {
        arow = ptJ[j] + dof;
        anzj = ai[arow+1] - ai[arow];
        ajj  = aj + ai[arow];
        for (k=0;k<anzj;k++) {
          if (!ptadenserow[ajj[k]]) {
            ptadenserow[ajj[k]]    = -1;
            ptasparserow[ptanzi++] = ajj[k];
          }
        }
      }
      /* Using the symbolic info for the row of PtA, determine the symbolic info for the row of C: */
      ptaj = ptasparserow;
      cnzi = 0;
      for (j=0;j<ptanzi;j++) {
        pdof = *ptaj%ppdof;
        prow = (*ptaj++)/ppdof;
        pnzj = pi[prow+1] - pi[prow];
        pjj  = pj + pi[prow];
        for (k=0;k<pnzj;k++) {
          if (!denserow[pjj[k]+pdof]) {
            denserow[pjj[k]+pdof] = -1;
            sparserow[cnzi++]     = pjj[k]+pdof;
          }
        }
      }

      /* sort sparserow */
      ierr = PetscSortInt(cnzi,sparserow);CHKERRQ(ierr);

      /* If free space is not available, make more free space */
      /* by doubling the total amount of space in the list */
      if (current_space->local_remaining<cnzi) {
        ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      }

      /* Copy data into the free space, and zero out the dense rows */
      ierr = PetscMemcpy(current_space->array,sparserow,cnzi*sizeof(PetscInt));CHKERRQ(ierr);
      current_space->array           += cnzi;
      current_space->local_used      += cnzi;
      current_space->local_remaining -= cnzi;

      for (j=0;j<ptanzi;j++) {
        ptadenserow[ptasparserow[j]] = 0;
      }
      for (j=0;j<cnzi;j++) {
        denserow[sparserow[j]] = 0;
      }
      /* Aside: Perhaps we should save the pta info for the numerical factorization. */
      /*        For now, we will recompute what is needed. */
      ci[i+1+dof] = ci[i+dof] + cnzi;
    }
  }
  /* nnz is now stored in ci[pn], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy the list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow);CHKERRQ(ierr);

  /* Allocate space for ca */
  ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
  ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);

  /* put together the new matrix */
  ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);

  /* MatCreateSeqAIJWithArrays flags the matrix so PETSc doesn't free the user's arrays. */
  /* Since these are PETSc arrays, change the flags to free them as necessary. */
  c = (Mat_SeqAIJ *)((*C)->data);
  c->freedata = PETSC_TRUE;
  c->nonew    = 0;

  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);

  ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END

/*
   MatPtAPNumeric - Computes the matrix projection C = P^T * A * P

   Collective on Mat

   Input Parameters:
+  A - the matrix
-  P - the projection matrix

   Output Parameters:
.  C - the product matrix

   Notes:
   C must have been created by calling MatPtAPSymbolic() and must be destroyed by
   the user with MatDestroy().

   This routine is currently only implemented for pairs of AIJ matrices and classes
   which inherit from AIJ.  C will be of type MATAIJ.

   Level: intermediate

.seealso: MatPtAP(),MatPtAPSymbolic(),MatMatMultNumeric()
*/
#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric"
PetscErrorCode MatPtAPNumeric(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  PetscErrorCode (*fA)(Mat,Mat,Mat);
  PetscErrorCode (*fP)(Mat,Mat,Mat);

  PetscFunctionBegin;

  PetscValidHeaderSpecific(A,MAT_COOKIE,1);
  PetscValidType(A,1);
  MatPreallocated(A);
  if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidHeaderSpecific(P,MAT_COOKIE,2);
  PetscValidType(P,2);
  MatPreallocated(P);
  if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidHeaderSpecific(C,MAT_COOKIE,3);
  PetscValidType(C,3);
  MatPreallocated(C);
  if (!C->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (C->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  if (P->N!=C->M) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->M);
  if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
  if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
  if (P->N!=C->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->N);

  /* For now, we do not dispatch based on the type of A and P */
  /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
  fA = A->ops->ptapnumeric;
  if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for A of type %s",A->type_name);
  fP = P->ops->ptapnumeric;
  if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for P of type %s",P->type_name);
  if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPNumeric requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);

  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = (*fA)(A,P,C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_SeqAIJ_SeqAIJ"
PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  PetscInt       flops=0;
  Mat_SeqAIJ     *a  = (Mat_SeqAIJ *) A->data;
  Mat_SeqAIJ     *p  = (Mat_SeqAIJ *) P->data;
  Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
  PetscInt       *ai=a->i,*aj=a->j,*apj,*apjdense,*pi=p->i,*pj=p->j,*pJ=p->j,*pjj;
  PetscInt       *ci=c->i,*cj=c->j,*cjj;
  PetscInt       am=A->M,cn=C->N,cm=C->M;
  PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  MatScalar      *aa=a->a,*apa,*pa=p->a,*pA=p->a,*paj,*ca=c->a,*caj;

  PetscFunctionBegin;
  /* Allocate a temporary array for storage of one row of A*P */
  ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);

  apj      = (PetscInt *)(apa + cn);
  apjdense = apj + cn;

  /* Clear the old values in C */
  ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0;i<am;i++) {
    /* Form the sparse row of A*P */
    anzi  = ai[i+1] - ai[i];
    apnzj = 0;
    for (j=0;j<anzi;j++) {
      prow = *aj++;
      pnzj = pi[prow+1] - pi[prow];
      pjj  = pj + pi[prow];
      paj  = pa + pi[prow];
      for (k=0;k<pnzj;k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*aa)*paj[k];
      }
      flops += 2*pnzj;
      aa++;
    }

    /* Sort the j index array for a quick sparse axpy. */
    ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);

    /* Compute P^T*A*P using the outer product (P^T)[:,j]*(A*P)[j,:]. */
    pnzi = pi[i+1] - pi[i];
    for (j=0;j<pnzi;j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cj + ci[crow];
      caj    = ca + ci[crow];
      /* Perform the sparse axpy operation.  Note that cjj includes apj. */
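      /* Worked instance (illustrative values only): with cjj = {0,2,5,9} and
         apj = {2,9}, k sweeps 0..3 while nextap advances only at the matches
         cjj[1]==2 and cjj[3]==9, so exactly caj[1] and caj[3] are updated. */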
      for (k=0;nextap<apnzj;k++) {
        if (cjj[k]==apj[nextap]) {
          caj[k] += (*pA)*apa[apj[nextap++]];
        }
      }
      flops += 2*apnzj;
      pA++;
    }

    /* Zero the current row info for A*P */
    for (j=0;j<apnzj;j++) {
      apa[apj[j]]      = 0.;
      apjdense[apj[j]] = 0;
    }
  }

  /* Assemble the final matrix and clean up */
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(apa);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* Compute local C = P_loc^T * A * P - used by MatPtAP_MPIAIJ_MPIAIJ() */
static PetscEvent logkey_matptapnumeric_local = 0;
#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_Local"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,Mat C)
{
  PetscErrorCode ierr;
  PetscInt       flops=0;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
  Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
  PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
  PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
  PetscInt       *pJ=pj_loc,*pjj;
  PetscInt       *ci=c->i,*cj=c->j,*cjj;
  PetscInt       am=A->m,cn=C->N,cm=C->M;
  PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*ca=c->a,*caj;
  MatScalar      *pa_loc=p_loc->a,*pA=pa_loc,*pa_oth=p_oth->a;

  PetscFunctionBegin;
  if (!logkey_matptapnumeric_local) {
    ierr = PetscLogEventRegister(&logkey_matptapnumeric_local,"MatPtAPNumeric_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
  }
1296   ierr = PetscLogEventBegin(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
1297 
1298   /* Allocate temporary array for storage of one row of A*P */
1299   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1300   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1301   apj      = (PetscInt *)(apa + cn);
1302   apjdense = apj + cn;
1303 
1304   /* Clear old values in C */
1305   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1306 
1307   for (i=0;i<am;i++) {
1308     /* Form i-th sparse row of A*P */
1309      apnzj = 0;
1310     /* diagonal portion of A */
1311     anzi  = adi[i+1] - adi[i];
1312     for (j=0;j<anzi;j++) {
1313       prow = *adj;
1314       adj++;
1315       pnzj = pi_loc[prow+1] - pi_loc[prow];
1316       pjj  = pj_loc + pi_loc[prow];
1317       paj  = pa_loc + pi_loc[prow];
1318       for (k=0;k<pnzj;k++) {
1319         if (!apjdense[pjj[k]]) {
1320           apjdense[pjj[k]] = -1;
1321           apj[apnzj++]     = pjj[k];
1322         }
1323         apa[pjj[k]] += (*ada)*paj[k];
1324       }
1325       flops += 2*pnzj;
1326       ada++;
1327     }
1328     /* off-diagonal portion of A */
1329     anzi  = aoi[i+1] - aoi[i];
    for (j=0;j<anzi;j++) {
      col = a->garray[*aoj];
      if (col < cstart){
        prow = *aoj;
      } else if (col >= cend){
        prow = *aoj;
      } else {
        SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
      }
      aoj++;
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      paj  = pa_oth + pi_oth[prow];
      for (k=0;k<pnzj;k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*aoa)*paj[k];
      }
      flops += 2*pnzj;
      aoa++;
    }
    /* Sort the j index array for quick sparse axpy. */
    ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
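    /* cjj and apj are now both sorted, and the pattern of row crow of C was
       built to contain apj, so the axpy below is a two-pointer merge in
       which every entry of apj is matched exactly once. */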

    /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
    pnzi = pi_loc[i+1] - pi_loc[i];
    for (j=0;j<pnzi;j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cj + ci[crow];
      caj    = ca + ci[crow];
      /* Perform sparse axpy operation.  Note cjj includes apj. */
      for (k=0;nextap<apnzj;k++) {
        if (cjj[k]==apj[nextap]) {
          caj[k] += (*pA)*apa[apj[nextap++]];
        }
      }
      flops += 2*apnzj;
      pA++;
    }

    /* Zero the current row info for A*P */
    for (j=0;j<apnzj;j++) {
      apa[apj[j]]      = 0.;
      apjdense[apj[j]] = 0;
    }
  }

  /* Assemble the final matrix and clean up */
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(apa);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
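
/*
   For reference, a minimal standalone sketch of the dense-accumulator row
   kernel used above; the names are illustrative only and not part of PETSc,
   and the block is excluded from compilation.  It forms one row of A*P for
   CSR inputs (ai,aj,aa) and (pi,pj,pa), returning the number of nonzeros
   collected in cols; acc and dense must arrive zeroed and be re-zeroed by
   the caller afterwards, exactly as the loops above do.
*/
#if 0
static int SpGEMM_Row(int arow,const int *ai,const int *aj,const double *aa,
                      const int *pi,const int *pj,const double *pa,
                      double *acc,int *dense,int *cols)
{
  int j,k,nz = 0;
  for (j=ai[arow]; j<ai[arow+1]; j++) {    /* each nonzero a(arow,prow) ... */
    int prow = aj[j];
    for (k=pi[prow]; k<pi[prow+1]; k++) {  /* ... scatters a(arow,prow)*P[prow,:] */
      int col = pj[k];
      if (!dense[col]) { dense[col] = -1; cols[nz++] = col; }
      acc[col] += aa[j]*pa[k];
    }
  }
  return nz;
}
#endif
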
static PetscEvent logkey_matptapsymbolic_local = 0;
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local"
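/*
   Symbolic phase: computes only the nonzero pattern (ci,cj) of the local
   product C = P_loc^T * A * P.  For each row i of P_loc^T, the rows of A it
   touches are collected; the columns of A encountered there are split into
   those indexing rows of P_loc (columns in the diagonal block) and those
   indexing rows of P_oth.  The union of the corresponding rows of P, merged
   through a sorted linked list, gives the columns of row i of C.
*/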
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
  PetscInt       *pti,*ptj,*ptJ,*ajj,*pjj;
  PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
  PetscInt       *ci,*cj,*ptaj;
  PetscInt       aN=A->N,am=A->m,pN=P_loc->N;
  PetscInt       i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
  PetscInt       pm=P_loc->m,nlnk,*lnk;
  MatScalar      *ca;
  PetscBT        lnkbt;
  PetscInt       prend,nprow_loc,nprow_oth;
  PetscInt       *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
  Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
  PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;

  PetscFunctionBegin;
  if (!logkey_matptapsymbolic_local) {
    ierr = PetscLogEventRegister(&logkey_matptapsymbolic_local,"MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);

  prend = prstart + pm;

  /* get ij structure of P_loc^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
  ptJ  = ptj;
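  /* pti/ptj describe the compressed-row pattern of P_loc^T: for each column
     of P_loc, the list of rows in which it has a nonzero.  Only the pattern
     is computed; no numerical values are moved. */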

  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow_loc = ptadenserow_loc + aN;
  ptadenserow_oth  = ptasparserow_loc + aN;
  ptasparserow_oth = ptadenserow_oth + aN;
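  /* The single allocation is carved into four length-aN arrays: a dense
     flag array plus a sparse index list for the rows of P_loc reached by
     the current row of P^T*A, and the analogous pair for rows of P_oth. */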

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
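  /* lnk keeps the column indices of the current row of C in a sorted linked
     list; the bit table lnkbt makes duplicate detection O(1) as rows of P
     are merged in. */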

  /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
  /* This should be reasonable if the sparsity of PtAP is similar to that of A. */
  nnz           = adi[am] + aoi[am];
  ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;
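  /* With fill = 1.0, for example, the initial guess is roughly nnz(A)*pN/aN:
     nnz(A) rescaled by the coarse-to-fine size ratio, which matches nnz(C)
     when C is about as dense as A. */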

  /* determine symbolic info for each row of C: */
  for (i=0;i<pN;i++) {
    ptnzi  = pti[i+1] - pti[i];
    nprow_loc = 0; nprow_oth = 0;
    /* i-th row of symbolic P_loc^T*A_loc: */
    for (j=0;j<ptnzi;j++) {
      arow = *ptJ++;
      /* diagonal portion of A */
      anzj = adi[arow+1] - adi[arow];
      ajj  = adj + adi[arow];
      for (k=0;k<anzj;k++) {
        col = ajj[k]+prstart;
        if (!ptadenserow_loc[col]) {
          ptadenserow_loc[col]    = -1;
          ptasparserow_loc[nprow_loc++] = col;
        }
      }
      /* off-diagonal portion of A */
      anzj = aoi[arow+1] - aoi[arow];
      ajj  = aoj + aoi[arow];
      for (k=0;k<anzj;k++) {
        col = a->garray[ajj[k]];  /* global col */
        if (col < cstart){
          col = ajj[k];
        } else if (col >= cend){
          col = ajj[k] + pm;
        } else {
          SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
        }
        if (!ptadenserow_oth[col]) {
          ptadenserow_oth[col]    = -1;
          ptasparserow_oth[nprow_oth++] = col;
        }
      }
    }

    /* Using the symbolic info of local P^T*A, determine the symbolic info for row i of C: */
    cnzi   = 0;
    /* rows of P_loc */
    ptaj = ptasparserow_loc;
    for (j=0; j<nprow_loc; j++) {
      prow = *ptaj++;
      prow -= prstart; /* shift back to a local row index of P_loc */
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }
    /* rows of P_oth */
    ptaj = ptasparserow_oth;
    for (j=0; j<nprow_oth; j++) {
      prow = *ptaj++;
      if (prow >= prend) prow -= pm; /* undo the +pm shift applied to columns >= cend above */
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }

    /* If free space is not available, make more free space */
    /* Double the amount of total space in the list */
    if (current_space->local_remaining<cnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, and zero out denserows */
    ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += cnzi;
    current_space->local_used      += cnzi;
    current_space->local_remaining -= cnzi;

    for (j=0;j<nprow_loc; j++) {
      ptadenserow_loc[ptasparserow_loc[j]] = 0;
    }
    for (j=0;j<nprow_oth; j++) {
      ptadenserow_oth[ptasparserow_oth[j]] = 0;
    }

    /* Aside: Perhaps we should save the pta info for the numerical factorization. */
    /*        For now, we will recompute what is needed. */
    ci[i+1] = ci[i] + cnzi;
  }
  /* nnz is now stored in ci[pN], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Allocate space for ca */
  ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
  ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);

  /* put together the new matrix */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,pN,pN,ci,cj,ca,C);CHKERRQ(ierr);

  /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
  /* Since these are PETSc arrays, change flags to free them as necessary. */
  c = (Mat_SeqAIJ *)((*C)->data);
  c->freedata = PETSC_TRUE;
  c->nonew    = 0;
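  /* freedata = PETSC_TRUE hands ownership of ci, cj, and ca to C, so
     MatDestroy() releases them; nonew = 0 restores the default insertion
     mode for the matrix. */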

  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);

  ierr = PetscLogEventEnd(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1554