xref: /petsc/src/mat/impls/aij/seq/matptap.c (revision 9ce94108eff7b428f2ec0ab6f6dce926bb5f4283)
1 /*
2   Defines projective product routines where A is an AIJ matrix
3           C = P^T * A * P
4 */
5 
6 #include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
7 #include "src/mat/utils/freespace.h"
8 #include "src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatPtAP"
13 /*@
14    MatPtAP - Creates the matrix projection C = P^T * A * P
15 
16    Collective on Mat
17 
18    Input Parameters:
19 +  A - the matrix
20 .  P - the projection matrix
21 .  scall - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
22 -  fill - expected fill as ratio of nnz(C)/(nnz(A) + nnz(P))
23 
24    Output Parameters:
25 .  C - the product matrix
26 
27    Notes:
28    C will be created and must be destroyed by the user with MatDestroy().
29 
30    This routine is currently implemented for pairs of SeqAIJ matrices (and classes which
31    inherit from SeqAIJ) and for pairs of MPIAIJ matrices.  C will be of type MATSEQAIJ or MATMPIAIJ, matching A.
32 
33    Level: intermediate
34 
35 .seealso: MatPtAPSymbolic(),MatPtAPNumeric(),MatMatMult()
36 @*/
37 PetscErrorCode MatPtAP(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
38 {
39   PetscErrorCode ierr;
40   PetscErrorCode (*fA)(Mat,Mat,MatReuse,PetscReal,Mat *);
41   PetscErrorCode (*fP)(Mat,Mat,MatReuse,PetscReal,Mat *);
42 
43   PetscFunctionBegin;
44   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
45   PetscValidType(A,1);
46   MatPreallocated(A);
47   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
48   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
49   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
50   PetscValidType(P,2);
51   MatPreallocated(P);
52   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
53   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
54   PetscValidPointer(C,3);
55 
56   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
57 
58   if (fill <=0.0) SETERRQ1(PETSC_ERR_ARG_SIZ,"fill=%g must be > 0.0",fill);
59 
60   /* For now, we do not dispatch based on the type of A and P */
61   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
62   fA = A->ops->ptap;
63   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for A of type %s",A->type_name);
64   fP = P->ops->ptap;
65   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for P of type %s",P->type_name);
66   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAP requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
67 
68   ierr = PetscLogEventBegin(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
69   ierr = (*fA)(A,P,scall,fill,C);CHKERRQ(ierr);
70   ierr = PetscLogEventEnd(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
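
/*
   A minimal usage sketch (illustrative; assumes A is an assembled n x n matrix and
   P an assembled n x m projection matrix of a supported type):

     Mat C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
     ...change numerical values in A, keeping its nonzero pattern...
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);
     ierr = MatDestroy(C);CHKERRQ(ierr);

   The MAT_REUSE_MATRIX call reuses the symbolic structure computed by the first call.
*/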
73 
74 EXTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,PetscReal,Mat*);
75 EXTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,Mat);
76 
77 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
78 #undef __FUNCT__
79 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
80 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
81 {
82   PetscErrorCode       ierr;
83   Mat_MatMatMultMPI    *ptap;
84   PetscObjectContainer container;
85 
86   PetscFunctionBegin;
87   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
88   if (container) {
89     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
90     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
91     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
92     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
93     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
94 
95     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
96     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
97     ierr = PetscFree(ptap);CHKERRQ(ierr); /* ptap is only valid when the container exists */
98   }
99 
100   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
101   PetscFunctionReturn(0);
102 }
103 
104 #undef __FUNCT__
105 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
106 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
107 {
108   PetscErrorCode       ierr;
109   Mat_MatMatMultMPI    *ptap;
110   PetscObjectContainer container;
111 
112   PetscFunctionBegin;
113   if (scall == MAT_INITIAL_MATRIX){
114     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
115     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
116 
117     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
118     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
119 
120     /* get P_loc by taking all local rows of P */
121     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
122 
123     /* attach the supporting struct to P for reuse */
124     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
125     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
126     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
127     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
128 
129     /* now, compute symbolic local P^T*A*P */
130     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
131     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
132   } else if (scall == MAT_REUSE_MATRIX){
133     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
134     if (container) {
135       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
136     } else {
137       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
138     }
139     /* update P_oth */
140     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
141 
142   } else {
143     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
144   }
145   /* now, compute numeric local P^T*A*P */
146   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
147   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
148   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
149 
150   PetscFunctionReturn(0);
151 }
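
/*
   Reuse pattern used above: the Mat_MatMatMultMPI work structure built in the
   symbolic phase is stashed on P in a PetscObjectContainer under the key
   "MatPtAPMPI"; a later call with scall==MAT_REUSE_MATRIX retrieves it with
   PetscObjectQuery() instead of recomputing the communication setup.
*/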
152 /* Set symbolic info for i-th row of local product C=P^T*A*P */
153 #define MatPtAPSymbolicLocal_Private(ptM,pti,ptj,ci,cj) 0;\
154 {\
155   /* allocate ci array, arrays for fill computation and */\
156   /* free space for accumulating nonzero column info */\
157   ierr = PetscMalloc((ptM+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);\
158   ci[0] = 0;\
159 \
160   /* set initial free space to be nnz(A) scaled by fill*P->N/PtM. */\
161   /* this should be reasonable if sparsity of PtAP is similar to that of A. */\
162   nnz           = adi[am] + aoi[am];\
163   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/ptM+1),&free_space);CHKERRQ(ierr);\
164   current_space = free_space;\
165 \
166   ptJ=ptj;\
167   for (i=0; i<ptM; i++) {\
168     ptnzi  = pti[i+1] - pti[i];\
169     nprow_loc = 0; nprow_oth = 0;\
170     /* i-th row of symbolic Pt*A: */\
171     for (j=0;j<ptnzi;j++) {\
172       arow = *ptJ++;\
173       /* diagonal portion of A */\
174       anzj = adi[arow+1] - adi[arow];\
175       ajj  = adj + adi[arow];\
176       for (k=0;k<anzj;k++) {\
177         col = ajj[k]+prstart;\
178         if (!ptadenserow_loc[col]) {\
179           ptadenserow_loc[col]    = -1;\
180           ptasparserow_loc[nprow_loc++] = col;\
181         }\
182       }\
183       /* off-diagonal portion of A */\
184       anzj = aoi[arow+1] - aoi[arow];\
185       ajj  = aoj + aoi[arow];\
186       for (k=0;k<anzj;k++) {\
187         col = a->garray[ajj[k]];  /* global col */\
188         if (col < cstart){\
189           col = ajj[k];\
190         } else if (col >= cend){\
191           col = ajj[k] + pm;\
192         } else {\
193           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");\
194         }\
195         if (!ptadenserow_oth[col]) {\
196           ptadenserow_oth[col]    = -1;\
197           ptasparserow_oth[nprow_oth++] = col;\
198         }\
199       }\
200     }\
201     /* determine symbolic info for row of C_seq: */\
202     cnzi   = 0;\
203     /* rows of P_loc */\
204     ptaj = ptasparserow_loc;\
205     for (j=0; j<nprow_loc; j++) {\
206       prow = *ptaj++; \
207       prow -= prstart; /* rm */\
208       pnzj = pi_loc[prow+1] - pi_loc[prow];\
209       pjj  = pj_loc + pi_loc[prow];\
210       /* add non-zero cols of P into the sorted linked list lnk */\
211       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
212       cnzi += nlnk;\
213     }\
214     /* rows of P_oth */\
215     ptaj = ptasparserow_oth;\
216     for (j=0; j<nprow_oth; j++) {\
217       prow = *ptaj++; \
218       if (prow >= prend) prow -= pm; /* rm */\
219       pnzj = pi_oth[prow+1] - pi_oth[prow];\
220       pjj  = pj_oth + pi_oth[prow];\
221       /* add non-zero cols of P into the sorted linked list lnk */\
222       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
223       cnzi += nlnk;\
224     }\
225    \
226     /* If free space is not available, make more free space */\
227     /* Double the amount of total space in the list */\
228     if (current_space->local_remaining<cnzi) {\
229       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);\
230     }\
231 \
232     /* Copy data into free space, and zero out denserows */\
233     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);\
234     current_space->array           += cnzi;\
235     current_space->local_used      += cnzi;\
236     current_space->local_remaining -= cnzi;\
237    \
238     for (j=0;j<nprow_loc; j++) {\
239       ptadenserow_loc[ptasparserow_loc[j]] = 0;\
240     }\
241     for (j=0;j<nprow_oth; j++) {\
242       ptadenserow_oth[ptasparserow_oth[j]] = 0;\
243     }\
244     \
245     /* Aside: Perhaps we should save the pta info for the numerical factorization. */\
246     /*        For now, we will recompute what is needed. */ \
247     ci[i+1] = ci[i] + cnzi;\
248   }\
249   /* nnz is now stored in ci[ptM], column indices are in the list of free space */\
250   /* Allocate space for cj, initialize cj, and */\
251   /* destroy list of free space and other temporary array(s) */\
252   ierr = PetscMalloc((ci[ptM]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);\
253   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);\
254 }
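
/*
   Note: the macro above expands to "0; { ... }" so that it can be invoked as
   "ierr = MatPtAPSymbolicLocal_Private(...);CHKERRQ(ierr);".  Besides its five
   parameters it reads and writes many locals of the caller (ierr, adi/adj,
   aoi/aoj, am, fill, pN, pm, prstart, prend, cstart, cend, a, pi_loc/pj_loc,
   pi_oth/pj_oth, nlnk, lnk, lnkbt, free_space, current_space, the
   ptadenserow/ptasparserow scratch arrays, and assorted loop indices), all of
   which must be in scope at the call site.
*/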
255 #undef __FUNCT__
256 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
257 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
258 {
259   PetscErrorCode       ierr;
260   Mat                  P_loc,P_oth;
261   Mat_MatMatMultMPI    *ptap;
262   PetscObjectContainer container;
263   FreeSpaceList        free_space=PETSC_NULL,current_space;
264   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
265   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,
266                        *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
267   Mat_SeqAIJ           *p_loc,*p_oth;
268   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj,*rmap=p->garray;
269   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
270   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
271   PetscInt             aN=A->N,am=A->m,pN=P->N;
272   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi,row;
273   PetscBT              lnkbt;
274   PetscInt             prstart,prend,nprow_loc,nprow_oth;
275   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
276 
277   MPI_Comm             comm=A->comm;
278   Mat                  B_mpi;
279   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
280   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
281   PetscInt             len,proc,*dnz,*onz,*owners;
282   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
283   PetscInt             nrows,tnrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
284   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
285   MPI_Status           *status;
286   Mat_Merge_SeqsToMPI  *merge;
287   PetscInt             *ptdi,*ptdj,*cdi,*cdj;
288   PetscInt             *ptoi,*ptoj,*coi,*coj;
289 
290   PetscFunctionBegin;
291   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
292   if (container) {
293     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
294   } else {
295     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
296   }
297 
298   /* determine row ownership of C */
299   /*---------------------------------------------------------*/
300   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
301   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
302   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
303 
304   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
305   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
306   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
307   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
308 
309   /* get data from P_loc and P_oth */
310   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
311   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
312   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
313   prend = prstart + pm;
314 
315   /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
316   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
317   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
318   ptasparserow_loc = ptadenserow_loc + aN;
319   ptadenserow_oth  = ptasparserow_loc + aN;
320   ptasparserow_oth = ptadenserow_oth + aN;
321 
322   /* create and initialize a linked list */
323   nlnk = pN+1;
324   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
325 
326   /* Pt = P_loc^T */
327   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
328   ierr = MatPtAPSymbolicLocal_Private(pN,pti,ptj,ci,cj);CHKERRQ(ierr);
329 
330   /* Pt = Pd^T - diagonal portion of P_loc */
331   ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);
332   ierr = MatPtAPSymbolicLocal_Private(pn,ptdi,ptdj,cdi,cdj);CHKERRQ(ierr);
333   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);
334 #ifdef NEW
335   /* Pt = Po^T - off-diagonal portion of P_loc */
336   ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&ptoi,&ptoj);CHKERRQ(ierr);
337   ierr = MatPtAPSymbolicLocal_Private(tnrows,ptoi,ptoj,coi,coj);CHKERRQ(ierr);
338   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&ptoi,&ptoj);CHKERRQ(ierr);
339 #endif
340   /* clean up */
341   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
342   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
343   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
344 
345   /* determine the number of messages to send, their lengths */
346   /*---------------------------------------------------------*/
347   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
348   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
349   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
350   len_s  = merge->len_s;
351   len = 0;  /* total length of buf_s[], summed over all outgoing messages */
352   merge->nsend = 0;
353   tnrows = (p->B)->N; /* total num of rows to be sent to other processors */
354   proc = 0;
355   for (i=0; i<tnrows; i++){
356     while (rmap[i] >= owners[proc+1]) proc++;
357     len_si[proc]++;
358   }
359   for (proc=0; proc<size; proc++){
360     len_s[proc] = 0;
361     if (len_si[proc]){
362       merge->nsend++;
363       len_si[proc] = 2*(len_si[proc] + 1);
364       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of col indices to be sent to [proc] */
365       len += len_si[proc];
366     }
367   }
368 
369   /* determine the number and length of messages to receive for ij-structure */
370   /*-------------------------------------------------------------------------*/
371   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
372   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
373 
374   /* post the Irecv of j-structure */
375   /*-------------------------------*/
376   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
377   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
378 
379   /* post the Isend of j-structure */
380   /*--------------------------------*/
381   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
382   sj_waits = si_waits + merge->nsend;
383 
384   for (proc=0, k=0; proc<size; proc++){
385     if (!len_s[proc]) continue;
386     i = owners[proc];
387     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
388     k++;
389   }
390 
391   /* receives and sends of j-structure are complete */
392   /*------------------------------------------------*/
393   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
394   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
395   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
396 
397   /* send and recv i-structure */
398   /*---------------------------*/
399   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
400   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
401 
402   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
403   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
404 
405   j = 0; /* row index to be sent */
406   for (proc=0,k=0; proc<size; proc++){
407     if (!len_s[proc]) continue;
408     /* form outgoing message for i-structure:
409          buf_si[0]:                 nrows to be sent
410                [1:nrows]:           row index (local to the receiving proc)
411                [nrows+1:2*nrows+1]: i-structure index
412     */
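    /* e.g. (illustrative): sending two rows whose indices local to [proc] are 3 and 7,
       carrying 2 and 4 nonzeros respectively, gives buf_si = {2, 3, 7, 0, 2, 6}
       with len_si[proc] = 2*(2+1) = 6 */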
413     /*-------------------------------------------*/
414     nrows = len_si[proc]/2 - 1;
415     buf_si_i    = buf_si + nrows+1;
416     buf_si[0]   = nrows;
417     buf_si_i[0] = 0;
418     for (i=0; i<nrows; i++){
419       row = rmap[j++];
420       anzi = ci[row+1] - ci[row];
421       buf_si_i[i+1] = buf_si_i[i] + anzi; /* i-structure */
422       buf_si[i+1]   = row - owners[proc]; /* local row index */
423     }
424     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
425     k++;
426     buf_si += len_si[proc];
427   }
428 
429   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
430   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
431 
432   ierr = PetscLogInfo((PetscObject)A,"MatPtAPSymbolic_MPIAIJ_MPIAIJ: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
433   for (i=0; i<merge->nrecv; i++){
434     ierr = PetscLogInfo((PetscObject)A,"MatPtAPSymbolic_MPIAIJ_MPIAIJ:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
435   }
436 
437   ierr = PetscFree(len_si);CHKERRQ(ierr);
438   ierr = PetscFree(len_ri);CHKERRQ(ierr);
439   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
440   ierr = PetscFree(si_waits);CHKERRQ(ierr);
441   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
442   ierr = PetscFree(buf_s);CHKERRQ(ierr);
443   ierr = PetscFree(status);CHKERRQ(ierr);
444 
445   /* compute a local seq matrix in each processor */
446   /*----------------------------------------------*/
447   /* allocate bi array and free space for accumulating nonzero column info */
448   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
449   bi[0] = 0;
450 
451   /* create and initialize a linked list */
452   nlnk = pN+1;
453   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
454 
455   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
456   /* num of local nonzeros of C_seq */
457   len  = ci[owners[rank+1]] - ci[owners[rank]];
458   free_space=PETSC_NULL;
459   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
460   current_space = free_space;
461 
462   /* determine symbolic info for each local row of C */
463   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
464   nextrow = buf_ri_k + merge->nrecv;
465   nextci  = nextrow + merge->nrecv;
466   for (k=0; k<merge->nrecv; k++){
467     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
468     nrows = *buf_ri_k[k];
469     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
470     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
471   }
472 
473   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
474   len = 0;
475   for (i=0; i<pn; i++) {
476     bnzi = 0;
477     /* add local non-zero cols of this proc's C_seq into lnk */
478     anzi = cdi[i+1] - cdi[i];
479     cji  = cdj + cdi[i];
480     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
481     bnzi += nlnk;
482     /* add received col data into lnk */
483     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
484       if (i == *nextrow[k]) { /* i-th row */
485         anzi = *(nextci[k]+1) - *nextci[k];
486         cji  = buf_rj[k] + *nextci[k];
487         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
488         bnzi += nlnk;
489         nextrow[k]++; nextci[k]++;
490       }
491     }
492     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
493 
494     /* if free space is not available, make more free space */
495     if (current_space->local_remaining<bnzi) {
496       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
497       nspacedouble++;
498     }
499     /* copy data into free space, then initialize lnk */
500     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
501     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
502 
503     current_space->array           += bnzi;
504     current_space->local_used      += bnzi;
505     current_space->local_remaining -= bnzi;
506 
507     bi[i+1] = bi[i] + bnzi;
508   }
509 
510   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
511 
512   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
513   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
514   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
515 
516   /* create symbolic parallel matrix B_mpi */
517   /*---------------------------------------*/
518   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
519   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
520   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
521   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
522 
523   /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
524   B_mpi->assembled     = PETSC_FALSE;
525   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
526   merge->bi            = bi;
527   merge->bj            = bj;
528   merge->ci            = ci;
529   merge->cj            = cj;
530   merge->buf_ri        = buf_ri;
531   merge->buf_rj        = buf_rj;
532   merge->C_seq         = PETSC_NULL;
533 
534   /* attach the supporting struct to B_mpi for reuse */
535   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
536   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
537   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
538   *C = B_mpi;
539 
540   ierr = PetscFree(cdi);CHKERRQ(ierr);
541   ierr = PetscFree(cdj);CHKERRQ(ierr);
542 #ifdef NEW
543   ierr = PetscFree(coi);CHKERRQ(ierr);
544   ierr = PetscFree(coj);CHKERRQ(ierr);
545 #endif
546   PetscFunctionReturn(0);
547 }
548 
549 #undef __FUNCT__
550 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
551 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
552 {
553   PetscErrorCode       ierr;
554   Mat_Merge_SeqsToMPI  *merge;
555   Mat_MatMatMultMPI    *ptap;
556   PetscObjectContainer cont_merge,cont_ptap;
557   PetscInt             flops=0;
558   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
559   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
560   Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data,*p_oth;
561   Mat_SeqAIJ           *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
562   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense;
563   PetscInt             *pi_oth,*pj_oth,*pJ_d=pd->j,*pJ_o=po->j,*pjj;
564   PetscInt             i,j,k,nzi,pnzi,apnzj,nextap,pnzj,prow,crow;
565   MatScalar            *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj,**abuf_r,*ba_i,*ca;
566   MatScalar            *pA_d=pd->a,*pA_o=po->a,*pa_oth;
567   PetscInt             am=A->m,cN=C->N,cm=C->m;
568   MPI_Comm             comm=C->comm;
569   PetscMPIInt          size,rank,taga,*len_s;
570   PetscInt             *owners,proc,nrows,**buf_ri,**buf_rj,**buf_ri_k,**nextrow,**nextcseqi;
571   PetscInt             *cjj,cseqnzi,*bj_i,*bi,*bj,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
572   MPI_Request          *s_waits,*r_waits;
573   MPI_Status           *status;
574   PetscInt             *cseqi,*cseqj,col;
575   PetscInt             stages[3];
576 
577   PetscFunctionBegin;
578   ierr = PetscLogStageRegister(&stages[0],"NumAP_local");CHKERRQ(ierr);
579   ierr = PetscLogStageRegister(&stages[1],"NumPtAP_local");CHKERRQ(ierr);
580   ierr = PetscLogStageRegister(&stages[2],"NumPtAP_Comm");CHKERRQ(ierr);
581 
582   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
583   if (cont_merge) {
584     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
585   } else {
586     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
587   }
588   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
589   if (cont_ptap) {
590     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
591   } else {
592     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
593   }
594   /* get data from symbolic products */
595   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
596   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
597 
598   cseqi = merge->ci; cseqj=merge->cj;
599   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
600   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
601 
602   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
603   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
604   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
605 
606   /* Allocate temporary array for storage of one row of A*P and one row of C */
607   ierr = PetscMalloc(2*cN*(sizeof(MatScalar)+sizeof(PetscInt)),&apa);CHKERRQ(ierr);
608   ierr = PetscMemzero(apa,2*cN*(sizeof(MatScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
609   ca       = (MatScalar*)(apa + cN);
610   apj      = (PetscInt *)(ca + cN);
611   apjdense = (PetscInt *)(apj + cN);
612 
613   /* Clear old values in C */
614   ierr = PetscMemzero(cd->a,cd->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
615   ierr = PetscMemzero(co->a,co->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
616 
617   for (i=0;i<am;i++) {
618     ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
619     /* Form i-th sparse row of A*P */
620     apnzj = 0;
621     /* diagonal portion of A */
622     nzi  = adi[i+1] - adi[i];
623     for (j=0;j<nzi;j++) {
624       prow = *adj++;
625       /* diagonal portion of P */
626       pnzj = pd->i[prow+1] - pd->i[prow];
627       pjj  = pd->j + pd->i[prow]; /* local col index of P */
628       paj  = pd->a + pd->i[prow];
629       for (k=0;k<pnzj;k++) {
630         col = *pjj + p->cstart; pjj++; /* global col index of P */
631         if (!apjdense[col]) {
632           apjdense[col] = -1;
633           apj[apnzj++]  = col;
634         }
635         apa[col] += (*ada)*paj[k];
636       }
637       flops += 2*pnzj;
638       /* off-diagonal portion of P */
639       pnzj = po->i[prow+1] - po->i[prow];
640       pjj  = po->j + po->i[prow]; /* local col index of P */
641       paj  = po->a + po->i[prow];
642       for (k=0;k<pnzj;k++) {
643         col = p->garray[*pjj]; pjj++; /* global col index of P */
644         if (!apjdense[col]) {
645           apjdense[col] = -1;
646           apj[apnzj++]  = col;
647         }
648         apa[col] += (*ada)*paj[k];
649       }
650       flops += 2*pnzj;
651 
652       ada++;
653     }
654     /* off-diagonal portion of A */
655     nzi  = aoi[i+1] - aoi[i];
656     for (j=0;j<nzi;j++) {
657       prow = *aoj++;
658       pnzj = pi_oth[prow+1] - pi_oth[prow];
659       pjj  = pj_oth + pi_oth[prow];
660       paj  = pa_oth + pi_oth[prow];
661       for (k=0;k<pnzj;k++) {
662         if (!apjdense[pjj[k]]) {
663           apjdense[pjj[k]] = -1;
664           apj[apnzj++]     = pjj[k];
665         }
666         apa[pjj[k]] += (*aoa)*paj[k];
667       }
668       flops += 2*pnzj;
669       aoa++;
670     }
671     /* Sort the j index array for quick sparse axpy. */
672     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
673 
674     ierr = PetscLogStagePop();CHKERRQ(ierr);
675     ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
676     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
677     /* diagonal portion of P -- gives matrix value of local C */
678     pnzi = pd->i[i+1] - pd->i[i];
679     for (j=0;j<pnzi;j++) {
680       /* add value into C */
681       for (k=0; k<apnzj; k++) ca[k] = (*pA_d)*apa[apj[k]];
682       crow = (*pJ_d++) + owners[rank];
683       ierr = MatSetValues(C,1,&crow,apnzj,apj,ca,ADD_VALUES);CHKERRQ(ierr);
684       ierr = PetscMemzero(ca,apnzj*(sizeof(MatScalar)));CHKERRQ(ierr);
685       pA_d++;
686       flops += 2*apnzj;
687     }
688 
689     /* off-diagonal portion of P -- gives matrix values to be sent to others */
690     pnzi = po->i[i+1] - po->i[i];
691     for (j=0;j<pnzi;j++) {
692       crow   = p->garray[*pJ_o++];
693       cjj    = cseqj + cseqi[crow];
694       caj    = cseqa + cseqi[crow];
695       /* add value into C_seq to be sent to other processors */
696       nextap = 0;
697       for (k=0;nextap<apnzj;k++) {
698         if (cjj[k]==apj[nextap]) {
699           caj[k] += (*pA_o)*apa[apj[nextap++]];
700         }
701       }
702       flops += 2*apnzj;
703       pA_o++;
704     }
705 
706     /* Zero the current row info for A*P */
707     for (j=0;j<apnzj;j++) {
708       apa[apj[j]]      = 0.;
709       apjdense[apj[j]] = 0;
710     }
711     ierr = PetscLogStagePop();CHKERRQ(ierr);
712   }
713 
714   /* send and recv matrix values */
715   /*-----------------------------*/
716   ierr = PetscLogStagePush(stages[2]);CHKERRQ(ierr);
717   bi     = merge->bi;
718   bj     = merge->bj;
719   buf_ri = merge->buf_ri;
720   buf_rj = merge->buf_rj;
721   len_s  = merge->len_s;
722   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
723   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
724 
725   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
726   for (proc=0,k=0; proc<size; proc++){
727     if (!len_s[proc]) continue;
728     i = owners[proc];
729     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
730     k++;
731   }
732   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
733   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
734   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
735   ierr = PetscFree(status);CHKERRQ(ierr);
736 
737   ierr = PetscFree(s_waits);CHKERRQ(ierr);
738   ierr = PetscFree(r_waits);CHKERRQ(ierr);
739   ierr = PetscFree(cseqa);CHKERRQ(ierr);
740 
741   /* insert mat values of mpimat */
742   /*----------------------------*/
743   ba_i = apa; /* rename, temporary array for storage of one row of C */
744   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
745   nextrow = buf_ri_k + merge->nrecv;
746   nextcseqi = nextrow + merge->nrecv;
747 
748   for (k=0; k<merge->nrecv; k++){
749     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
750     nrows        = *(buf_ri_k[k]);
751     nextrow[k]   = buf_ri_k[k]+1;  /* next row index of k-th received i-structure */
752     nextcseqi[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
753   }
754 
755   /* add received local vals to C */
756   for (i=0; i<cm; i++) {
757     crow = owners[rank] + i;
758     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
759     bnzi = bi[i+1] - bi[i];
760     nzi = 0;
761     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
762       /* i-th row */
763       if (i == *nextrow[k]) {
764         cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k];
765         cseqj   = buf_rj[k] + *(nextcseqi[k]);
766         cseqa   = abuf_r[k] + *(nextcseqi[k]);
767         nextcseqj = 0;
768         for (j=0; nextcseqj<cseqnzi; j++){
769           if (*(bj_i + j) == cseqj[nextcseqj]){ /* bcol == cseqcol */
770             ba_i[j] += cseqa[nextcseqj++];
771             nzi++;
772           }
773         }
774         nextrow[k]++; nextcseqi[k]++;
775       }
776     }
777     if (nzi>0){
778       ierr = MatSetValues(C,1,&crow,bnzi,bj_i,ba_i,ADD_VALUES);CHKERRQ(ierr);
779       ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
780       flops += 2*nzi;
781     }
782   }
783   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
784   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
785   ierr = PetscLogStagePop();CHKERRQ(ierr);
786 
787   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
788   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
789   ierr = PetscFree(apa);CHKERRQ(ierr);
790   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
791 
792   PetscFunctionReturn(0);
793 }
794 
795 #undef __FUNCT__
796 #define __FUNCT__ "MatPtAP_SeqAIJ_SeqAIJ"
797 PetscErrorCode MatPtAP_SeqAIJ_SeqAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
798 {
799   PetscErrorCode ierr;
800 
801   PetscFunctionBegin;
802   if (scall == MAT_INITIAL_MATRIX){
803     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
804     ierr = MatPtAPSymbolic_SeqAIJ_SeqAIJ(A,P,fill,C);CHKERRQ(ierr);
805     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
806   }
807   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
808   ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,*C);CHKERRQ(ierr);
809   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
810   PetscFunctionReturn(0);
811 }
812 
813 /*
814    MatPtAPSymbolic - Creates the (i,j) structure of the matrix projection C = P^T * A * P
815 
816    Collective on Mat
817 
818    Input Parameters:
819 +  A - the matrix
820 -  P - the projection matrix
821 
822    Output Parameters:
823 .  C - the (i,j) structure of the product matrix
824 
825    Notes:
826    C will be created and must be destroyed by the user with MatDestroy().
827 
828    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
829    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.  The product is computed using
830    this (i,j) structure by calling MatPtAPNumeric().
831 
832    Level: intermediate
833 
834 .seealso: MatPtAP(),MatPtAPNumeric(),MatMatMultSymbolic()
835 */
836 #undef __FUNCT__
837 #define __FUNCT__ "MatPtAPSymbolic"
838 PetscErrorCode MatPtAPSymbolic(Mat A,Mat P,PetscReal fill,Mat *C)
839 {
840   PetscErrorCode ierr;
841   PetscErrorCode (*fA)(Mat,Mat,PetscReal,Mat*);
842   PetscErrorCode (*fP)(Mat,Mat,PetscReal,Mat*);
843 
844   PetscFunctionBegin;
845 
846   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
847   PetscValidType(A,1);
848   MatPreallocated(A);
849   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
850   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
851 
852   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
853   PetscValidType(P,2);
854   MatPreallocated(P);
855   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
856   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
857 
858   PetscValidPointer(C,3);
859 
860   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
861   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
862 
863   /* For now, we do not dispatch based on the type of A and P */
864   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
865   fA = A->ops->ptapsymbolic;
866   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for A of type %s",A->type_name);
867   fP = P->ops->ptapsymbolic;
868   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for P of type %s",P->type_name);
869   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPSymbolic requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
870 
871   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
872   ierr = (*fA)(A,P,fill,C);CHKERRQ(ierr);
873   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
874 
875   PetscFunctionReturn(0);
876 }
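
/*
   A minimal sketch of the symbolic/numeric split (illustrative; assumes A and P
   are assembled AIJ matrices of compatible sizes):

     Mat C;
     ierr = MatPtAPSymbolic(A,P,1.0,&C);CHKERRQ(ierr);
     ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);
     ...change numerical values of A, keeping its nonzero pattern...
     ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);
     ierr = MatDestroy(C);CHKERRQ(ierr);

   The second MatPtAPNumeric() call reuses the (i,j) structure of C built by
   MatPtAPSymbolic(), so only the numerical values are recomputed.
*/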
877 
878 typedef struct {
879   Mat    symAP;
880 } Mat_PtAPstruct;
881 
882 EXTERN PetscErrorCode MatDestroy_SeqAIJ(Mat);
883 
884 #undef __FUNCT__
885 #define __FUNCT__ "MatDestroy_SeqAIJ_PtAP"
886 PetscErrorCode MatDestroy_SeqAIJ_PtAP(Mat A)
887 {
888   PetscErrorCode    ierr;
889   Mat_PtAPstruct    *ptap=(Mat_PtAPstruct*)A->spptr;
890 
891   PetscFunctionBegin;
892   ierr = MatDestroy(ptap->symAP);CHKERRQ(ierr);
893   ierr = PetscFree(ptap);CHKERRQ(ierr);
894   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
895   PetscFunctionReturn(0);
896 }
897 
898 #undef __FUNCT__
899 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqAIJ"
900 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
901 {
902   PetscErrorCode ierr;
903   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
904   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*p = (Mat_SeqAIJ*)P->data,*c;
905   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
906   PetscInt       *ci,*cj,*ptadenserow,*ptasparserow,*ptaj;
907   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M;
908   PetscInt       i,j,k,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,nlnk,*lnk;
909   MatScalar      *ca;
910   PetscBT        lnkbt;
911 
912   PetscFunctionBegin;
913   /* Get ij structure of P^T */
914   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
915   ptJ=ptj;
916 
917   /* Allocate ci array, arrays for fill computation and */
918   /* free space for accumulating nonzero column info */
919   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
920   ci[0] = 0;
921 
922   ierr = PetscMalloc((2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
923   ierr = PetscMemzero(ptadenserow,(2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
924   ptasparserow = ptadenserow  + an;
925 
926   /* create and initialize a linked list */
927   nlnk = pn+1;
928   ierr = PetscLLCreate(pn,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
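  /* lnk is a sorted singly linked list over column indices in [0,pn), with lnkbt
     a companion bit table used for O(1) duplicate detection; PetscLLAdd() below
     merges each row of P into the list and PetscLLClean() drains it into the
     free-space array, resetting it for the next row of C */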
929 
930   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
931   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
932   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
933   current_space = free_space;
934 
935   /* Determine symbolic info for each row of C: */
936   for (i=0;i<pn;i++) {
937     ptnzi  = pti[i+1] - pti[i];
938     ptanzi = 0;
939     /* Determine symbolic row of PtA: */
940     for (j=0;j<ptnzi;j++) {
941       arow = *ptJ++;
942       anzj = ai[arow+1] - ai[arow];
943       ajj  = aj + ai[arow];
944       for (k=0;k<anzj;k++) {
945         if (!ptadenserow[ajj[k]]) {
946           ptadenserow[ajj[k]]    = -1;
947           ptasparserow[ptanzi++] = ajj[k];
948         }
949       }
950     }
951     /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
952     ptaj = ptasparserow;
953     cnzi   = 0;
954     for (j=0;j<ptanzi;j++) {
955       prow = *ptaj++;
956       pnzj = pi[prow+1] - pi[prow];
957       pjj  = pj + pi[prow];
958       /* add non-zero cols of P into the sorted linked list lnk */
959       ierr = PetscLLAdd(pnzj,pjj,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
960       cnzi += nlnk;
961     }
962 
963     /* If free space is not available, make more free space */
964     /* Double the amount of total space in the list */
965     if (current_space->local_remaining<cnzi) {
966       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
967     }
968 
969     /* Copy data into free space, and zero out denserows */
970     ierr = PetscLLClean(pn,pn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
971     current_space->array           += cnzi;
972     current_space->local_used      += cnzi;
973     current_space->local_remaining -= cnzi;
974 
975     for (j=0;j<ptanzi;j++) {
976       ptadenserow[ptasparserow[j]] = 0;
977     }
978     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
979     /*        For now, we will recompute what is needed. */
980     ci[i+1] = ci[i] + cnzi;
981   }
982   /* nnz is now stored in ci[pn], column indices are in the list of free space */
983   /* Allocate space for cj, initialize cj, and */
984   /* destroy list of free space and other temporary array(s) */
985   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
986   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
987   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
988   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
989 
990   /* Allocate space for ca */
991   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
992   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
993 
994   /* put together the new matrix */
995   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
996 
997   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
998   /* Since these are PETSc arrays, change flags to free them as necessary. */
999   c = (Mat_SeqAIJ *)((*C)->data);
1000   c->freedata = PETSC_TRUE;
1001   c->nonew    = 0;
1002 
1003   /* Clean up. */
1004   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1005 
1006   PetscFunctionReturn(0);
1007 }
1008 
1009 #include "src/mat/impls/maij/maij.h"
1010 EXTERN_C_BEGIN
1011 #undef __FUNCT__
1012 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqMAIJ"
1013 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqMAIJ(Mat A,Mat PP,Mat *C)
1014 {
1015   /* This routine requires testing -- I don't think it works. */
1016   PetscErrorCode ierr;
1017   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1018   Mat_SeqMAIJ    *pp=(Mat_SeqMAIJ*)PP->data;
1019   Mat            P=pp->AIJ;
1020   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*p=(Mat_SeqAIJ*)P->data,*c;
1021   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
1022   PetscInt       *ci,*cj,*denserow,*sparserow,*ptadenserow,*ptasparserow,*ptaj;
1023   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M,ppdof=pp->dof;
1024   PetscInt       i,j,k,dof,pdof,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi;
1025   MatScalar      *ca;
1026 
1027   PetscFunctionBegin;
1028   /* Start timer */
1029   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1030 
1031   /* Get ij structure of P^T */
1032   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1033 
1034   /* Allocate ci array, arrays for fill computation and */
1035   /* free space for accumulating nonzero column info */
1036   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1037   ci[0] = 0;
1038 
1039   ierr = PetscMalloc((2*pn+2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1040   ierr = PetscMemzero(ptadenserow,(2*pn+2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1041   ptasparserow = ptadenserow  + an;
1042   denserow     = ptasparserow + an;
1043   sparserow    = denserow     + pn;
1044 
1045   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1046   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1047   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
1048   current_space = free_space;
1049 
1050   /* Determine symbolic info for each row of C: */
1051   for (i=0;i<pn/ppdof;i++) {
1052     ptnzi  = pti[i+1] - pti[i];
1053     ptanzi = 0;
1054     ptJ    = ptj + pti[i];
1055     for (dof=0;dof<ppdof;dof++) {
1056     /* Determine symbolic row of PtA: */
1057       for (j=0;j<ptnzi;j++) {
1058         arow = ptJ[j] + dof;
1059         anzj = ai[arow+1] - ai[arow];
1060         ajj  = aj + ai[arow];
1061         for (k=0;k<anzj;k++) {
1062           if (!ptadenserow[ajj[k]]) {
1063             ptadenserow[ajj[k]]    = -1;
1064             ptasparserow[ptanzi++] = ajj[k];
1065           }
1066         }
1067       }
1068       /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1069       ptaj = ptasparserow;
1070       cnzi   = 0;
1071       for (j=0;j<ptanzi;j++) {
1072         pdof = *ptaj%ppdof;     /* interleaved dof layout: index = prow*ppdof + pdof */
1073         prow = (*ptaj++)/ppdof;
1074         pnzj = pi[prow+1] - pi[prow];
1075         pjj  = pj + pi[prow];
1076         for (k=0;k<pnzj;k++) {
1077           if (!denserow[pjj[k]+pdof]) {
1078             denserow[pjj[k]+pdof] = -1;
1079             sparserow[cnzi++]     = pjj[k]+pdof;
1080           }
1081         }
1082       }
1083 
1084       /* sort sparserow */
1085       ierr = PetscSortInt(cnzi,sparserow);CHKERRQ(ierr);
1086 
1087       /* If free space is not available, make more free space */
1088       /* Double the amount of total space in the list */
1089       if (current_space->local_remaining<cnzi) {
1090         ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1091       }
1092 
1093       /* Copy data into free space, and zero out denserows */
1094       ierr = PetscMemcpy(current_space->array,sparserow,cnzi*sizeof(PetscInt));CHKERRQ(ierr);
1095       current_space->array           += cnzi;
1096       current_space->local_used      += cnzi;
1097       current_space->local_remaining -= cnzi;
1098 
1099       for (j=0;j<ptanzi;j++) {
1100         ptadenserow[ptasparserow[j]] = 0;
1101       }
1102       for (j=0;j<cnzi;j++) {
1103         denserow[sparserow[j]] = 0;
1104       }
1105       /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1106       /*        For now, we will recompute what is needed. */
1107       ci[i*ppdof+dof+1] = ci[i*ppdof+dof] + cnzi; /* row i*ppdof+dof of C */
1108     }
1109   }
1110   /* nnz is now stored in ci[pn], column indices are in the list of free space */
1111   /* Allocate space for cj, initialize cj, and */
1112   /* destroy list of free space and other temporary array(s) */
1113   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1114   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1115   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1116 
1117   /* Allocate space for ca */
1118   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1119   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1120 
1121   /* put together the new matrix */
1122   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1123 
1124   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1125   /* Since these are PETSc arrays, change flags to free them as necessary. */
1126   c = (Mat_SeqAIJ *)((*C)->data);
1127   c->freedata = PETSC_TRUE;
1128   c->nonew    = 0;
1129 
1130   /* Clean up. */
1131   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1132 
1133   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1134   PetscFunctionReturn(0);
1135 }
1136 EXTERN_C_END
1137 
1138 /*
1139    MatPtAPNumeric - Computes the matrix projection C = P^T * A * P
1140 
1141    Collective on Mat
1142 
1143    Input Parameters:
1144 +  A - the matrix
1145 -  P - the projection matrix
1146 
1147    Output Parameters:
1148 .  C - the product matrix
1149 
1150    Notes:
1151    C must have been created by calling MatPtAPSymbolic() and must be destroyed by
1152    the user using MatDestroy().
1153 
1154    This routine is currently only implemented for pairs of AIJ matrices and classes
1155    which inherit from AIJ.  C will be of type MATAIJ.
1156 
1157    Level: intermediate
1158 
1159 .seealso: MatPtAP(),MatPtAPSymbolic(),MatMatMultNumeric()
1160 */
1161 #undef __FUNCT__
1162 #define __FUNCT__ "MatPtAPNumeric"
1163 PetscErrorCode MatPtAPNumeric(Mat A,Mat P,Mat C)
1164 {
1165   PetscErrorCode ierr;
1166   PetscErrorCode (*fA)(Mat,Mat,Mat);
1167   PetscErrorCode (*fP)(Mat,Mat,Mat);
1168 
1169   PetscFunctionBegin;
1170 
1171   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
1172   PetscValidType(A,1);
1173   MatPreallocated(A);
1174   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1175   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1176 
1177   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
1178   PetscValidType(P,2);
1179   MatPreallocated(P);
1180   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1181   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1182 
1183   PetscValidHeaderSpecific(C,MAT_COOKIE,3);
1184   PetscValidType(C,3);
1185   MatPreallocated(C);
1186   if (!C->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1187   if (C->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1188 
1189   if (P->N!=C->M) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->M);
1190   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
1191   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
1192   if (P->N!=C->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->N);
1193 
1194   /* For now, we do not dispatch based on the type of A and P */
1195   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
1196   fA = A->ops->ptapnumeric;
1197   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for A of type %s",A->type_name);
1198   fP = P->ops->ptapnumeric;
1199   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for P of type %s",P->type_name);
1200   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPNumeric requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
1201 
1202   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1203   ierr = (*fA)(A,P,C);CHKERRQ(ierr);
1204   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1205 
1206   PetscFunctionReturn(0);
1207 }
1208 
1209 #undef __FUNCT__
1210 #define __FUNCT__ "MatPtAPNumeric_SeqAIJ_SeqAIJ"
1211 PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ(Mat A,Mat P,Mat C)
1212 {
1213   PetscErrorCode ierr;
1214   PetscInt       flops=0;
1215   Mat_SeqAIJ     *a  = (Mat_SeqAIJ *) A->data;
1216   Mat_SeqAIJ     *p  = (Mat_SeqAIJ *) P->data;
1217   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
1218   PetscInt       *ai=a->i,*aj=a->j,*apj,*apjdense,*pi=p->i,*pj=p->j,*pJ=p->j,*pjj;
1219   PetscInt       *ci=c->i,*cj=c->j,*cjj;
1220   PetscInt       am=A->M,cn=C->N,cm=C->M;
1221   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1222   MatScalar      *aa=a->a,*apa,*pa=p->a,*pA=p->a,*paj,*ca=c->a,*caj;
1223 
1224   PetscFunctionBegin;
1225   /* Allocate temporary array for storage of one row of A*P */
1226   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1227   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1228 
1229   apj      = (PetscInt *)(apa + cn);
1230   apjdense = apj + cn;
1231 
1232   /* Clear old values in C */
1233   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1234 
1235   for (i=0;i<am;i++) {
1236     /* Form sparse row of A*P */
1237     anzi  = ai[i+1] - ai[i];
1238     apnzj = 0;
1239     for (j=0;j<anzi;j++) {
1240       prow = *aj++;
1241       pnzj = pi[prow+1] - pi[prow];
1242       pjj  = pj + pi[prow];
1243       paj  = pa + pi[prow];
1244       for (k=0;k<pnzj;k++) {
1245         if (!apjdense[pjj[k]]) {
1246           apjdense[pjj[k]] = -1;
1247           apj[apnzj++]     = pjj[k];
1248         }
1249         apa[pjj[k]] += (*aa)*paj[k];
1250       }
1251       flops += 2*pnzj;
1252       aa++;
1253     }
1254 
1255     /* Sort the j index array for quick sparse axpy. */
1256     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1257 
1258     /* Compute P^T*A*P using outer product P[i,:]^T*(A*P)[i,:]. */
1259     pnzi = pi[i+1] - pi[i];
1260     for (j=0;j<pnzi;j++) {
1261       nextap = 0;
1262       crow   = *pJ++;
1263       cjj    = cj + ci[crow];
1264       caj    = ca + ci[crow];
1265       /* Perform sparse axpy operation.  Note cjj includes apj. */
1266       for (k=0;nextap<apnzj;k++) {
1267         if (cjj[k]==apj[nextap]) {
1268           caj[k] += (*pA)*apa[apj[nextap++]];
1269         }
1270       }
1271       flops += 2*apnzj;
1272       pA++;
1273     }
1274 
1275     /* Zero the current row info for A*P */
1276     for (j=0;j<apnzj;j++) {
1277       apa[apj[j]]      = 0.;
1278       apjdense[apj[j]] = 0;
1279     }
1280   }
1281 
1282   /* Assemble the final matrix and clean up */
1283   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1284   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1285   ierr = PetscFree(apa);CHKERRQ(ierr);
1286   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1287 
1288   PetscFunctionReturn(0);
1289 }
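
/*
   The kernel above uses the row-wise outer-product formulation

       C = P^T * A * P = sum_i P[i,:]^T * (A*P)[i,:],

   accumulating each sparse row (A*P)[i,:] in the dense work array apa and then
   scattering P[i,:]^T * (A*P)[i,:] into the rows of C selected by the nonzeros
   of row i of P.
*/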
1290 
1291 /* Compute local C = P_loc^T * A * P - used by MatPtAP_MPIAIJ_MPIAIJ() */
1292 static PetscEvent logkey_matptapnumeric_local = 0;
1293 #undef __FUNCT__
1294 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_Local"
1295 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,Mat C)
1296 {
1297   PetscErrorCode ierr;
1298   PetscInt       flops=0;
1299   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
1300   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1301   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
1302   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
1303   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
1304   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
1305   PetscInt       *pJ=pj_loc,*pjj;
1306   PetscInt       *ci=c->i,*cj=c->j,*cjj;
1307   PetscInt       am=A->m,cn=C->N,cm=C->M;
1308   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1309   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*ca=c->a,*caj;
1310   MatScalar      *pa_loc=p_loc->a,*pA=pa_loc,*pa_oth=p_oth->a;
1311 
1312   PetscFunctionBegin;
1313   if (!logkey_matptapnumeric_local) {
1314     ierr = PetscLogEventRegister(&logkey_matptapnumeric_local,"MatPtAPNumeric_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
1315   }
1316   ierr = PetscLogEventBegin(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
1317 
1318   /* Allocate temporary array for storage of one row of A*P */
1319   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1320   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1321   apj      = (PetscInt *)(apa + cn);
1322   apjdense = apj + cn;

  /* Clear old values in C */
  ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0;i<am;i++) {
    /* Form i-th sparse row of A*P */
    apnzj = 0;
    /* diagonal portion of A */
    anzi  = adi[i+1] - adi[i];
    for (j=0;j<anzi;j++) {
      prow = *adj++;
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      paj  = pa_loc + pi_loc[prow];
      for (k=0;k<pnzj;k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*ada)*paj[k];
      }
      flops += 2*pnzj;
      ada++;
    }
    /* off-diagonal portion of A */
    anzi  = aoi[i+1] - aoi[i];
    for (j=0;j<anzi;j++) {
      col = a->garray[*aoj]; /* global column of this off-diagonal entry */
      if (col >= cstart && col < cend) {
        SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
      }
      prow = *aoj++; /* the local off-diagonal column index is the row of P_oth */
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      paj  = pa_oth + pi_oth[prow];
      for (k=0;k<pnzj;k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*aoa)*paj[k];
      }
      flops += 2*pnzj;
      aoa++;
    }
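    /* apa/apj now hold row i of A*P: the diagonal block of A contributed
       through P_loc and the off-diagonal block through P_oth. */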
    /* Sort the j index array for quick sparse axpy. */
    ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);

    /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
    pnzi = pi_loc[i+1] - pi_loc[i];
    for (j=0;j<pnzi;j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cj + ci[crow];
      caj    = ca + ci[crow];
      /* Perform sparse axpy operation.  Note cjj includes apj. */
      for (k=0;nextap<apnzj;k++) {
        if (cjj[k]==apj[nextap]) {
          caj[k] += (*pA)*apa[apj[nextap++]];
        }
      }
      flops += 2*apnzj;
      pA++;
    }

    /* Zero the current row info for A*P */
    for (j=0;j<apnzj;j++) {
      apa[apj[j]]      = 0.;
      apjdense[apj[j]] = 0;
    }
  }

  /* Assemble the final matrix and clean up */
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(apa);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
static PetscEvent logkey_matptapsymbolic_local = 0;
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
  PetscInt       *pti,*ptj,*ptJ,*ajj,*pjj;
  PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
  PetscInt       *ci,*cj,*ptaj;
  PetscInt       aN=A->N,am=A->m,pN=P_loc->N;
  PetscInt       i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
  PetscInt       pm=P_loc->m,nlnk,*lnk;
  MatScalar      *ca;
  PetscBT        lnkbt;
  PetscInt       prend,nprow_loc,nprow_oth;
  PetscInt       *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
  Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
  PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;

  PetscFunctionBegin;
  if (!logkey_matptapsymbolic_local) {
    ierr = PetscLogEventRegister(&logkey_matptapsymbolic_local,"MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);

  prend = prstart + pm;

  /* get ij structure of P_loc^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
  ptJ  = ptj;

  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow_loc = ptadenserow_loc + aN;
  ptadenserow_oth  = ptasparserow_loc + aN;
  ptasparserow_oth = ptadenserow_oth + aN;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
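  /* lnk is a sorted linked list, backed by the lnkbt bitmap, that merges the
     sorted column-index sets contributed to one row of C; it plays the role
     the dense apa/apjdense accumulator plays in the numeric phase. */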

  /* Set initial free space to be nnz(A) scaled by fill*P->N/A->N. */
  /* This should be reasonable if the sparsity of PtAP is similar to that of A. */
  nnz           = adi[am] + aoi[am];
  ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;
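  /* Illustrative sizing (made-up numbers): with fill=1.0, nnz(A)=10000, and
     pN equal to aN, the initial request is 10001 entries; rows that overflow
     this estimate trigger another GetMoreSpace() call below. */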

  /* determine symbolic info for each row of C: */
  for (i=0;i<pN;i++) {
    ptnzi     = pti[i+1] - pti[i];
    nprow_loc = 0; nprow_oth = 0;
    /* i-th row of symbolic P_loc^T*A_loc: */
    for (j=0;j<ptnzi;j++) {
      arow = *ptJ++;
      /* diagonal portion of A */
      anzj = adi[arow+1] - adi[arow];
      ajj  = adj + adi[arow];
      for (k=0;k<anzj;k++) {
        col = ajj[k]+prstart;
        if (!ptadenserow_loc[col]) {
          ptadenserow_loc[col]          = -1;
          ptasparserow_loc[nprow_loc++] = col;
        }
      }
      /* off-diagonal portion of A */
      anzj = aoi[arow+1] - aoi[arow];
      ajj  = aoj + aoi[arow];
      for (k=0;k<anzj;k++) {
        col = a->garray[ajj[k]];  /* global col */
        if (col < cstart){
          col = ajj[k];
        } else if (col >= cend){
          col = ajj[k] + pm;
        } else {
          SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
        }
        if (!ptadenserow_oth[col]) {
          ptadenserow_oth[col]          = -1;
          ptasparserow_oth[nprow_oth++] = col;
        }
      }
    }

    /* Using the symbolic info of the local PtA, determine the symbolic info for this row of C: */
    cnzi = 0;
    /* rows of P_loc */
    ptaj = ptasparserow_loc;
    for (j=0; j<nprow_loc; j++) {
      prow  = *ptaj++;
      prow -= prstart; /* revert the +prstart marker offset to a local row of P_loc */
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }
    /* rows of P_oth */
    ptaj = ptasparserow_oth;
    for (j=0; j<nprow_oth; j++) {
      prow = *ptaj++;
      if (prow >= prend) prow -= pm; /* entries at or beyond prend carry the +pm marker offset; map back */
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }

    /* If free space is not available, make more free space */
    /* Double the amount of total space in the list */
    if (current_space->local_remaining<cnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, and zero out dense rows */
    ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += cnzi;
    current_space->local_used      += cnzi;
    current_space->local_remaining -= cnzi;

    for (j=0;j<nprow_loc; j++) {
      ptadenserow_loc[ptasparserow_loc[j]] = 0;
    }
    for (j=0;j<nprow_oth; j++) {
      ptadenserow_oth[ptasparserow_oth[j]] = 0;
    }

    /* Aside: Perhaps we should save the pta info for the numeric phase. */
    /*        For now, we will recompute what is needed. */
    ci[i+1] = ci[i] + cnzi;
  }
  /* nnz is now stored in ci[pN], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Allocate space for ca */
  ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
  ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);

  /* put together the new matrix */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,pN,pN,ci,cj,ca,C);CHKERRQ(ierr);

  /* MatCreateSeqAIJWithArrays flags the matrix so PETSc doesn't free the user's arrays. */
  /* Since these are PETSc arrays, change the flags so they are freed as needed. */
  c = (Mat_SeqAIJ *)((*C)->data);
  c->freedata = PETSC_TRUE;
  c->nonew    = 0;

  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);

  ierr = PetscLogEventEnd(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

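/*
   Sketch of the intended calling sequence for the two Local routines above
   (illustrative only; the actual driver is MatPtAP_MPIAIJ_MPIAIJ(), and
   P_loc, P_oth, and prstart are assumed to have been set up by it):

     Mat C_loc;
     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(A,P_loc,P_oth,prstart,fill,&C_loc);CHKERRQ(ierr);
     ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(A,P_loc,P_oth,prstart,C_loc);CHKERRQ(ierr);
*/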