/*
  Defines projective product routines where A is an AIJ matrix
          C = P^T * A * P
*/

#include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
#include "src/mat/utils/freespace.h"
#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

#undef __FUNCT__
#define __FUNCT__ "MatPtAP"
/*@
   MatPtAP - Creates the matrix projection C = P^T * A * P

   Collective on Mat

   Input Parameters:
+  A - the matrix
.  P - the projection matrix
.  scall - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
-  fill - expected fill as ratio of nnz(C)/(nnz(A) + nnz(P))

   Output Parameters:
.  C - the product matrix

   Notes:
   C will be created and must be destroyed by the user with MatDestroy().

   This routine is currently only implemented for pairs of SeqAIJ matrices and classes
   which inherit from SeqAIJ.  C will be of type MATSEQAIJ.

   Level: intermediate

.seealso: MatPtAPSymbolic(),MatPtAPNumeric(),MatMatMult()
@*/
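/* A minimal usage sketch (assuming A and P are assembled SeqAIJ matrices and
   that nnz(C) is roughly nnz(A)+nnz(P), hence fill=1.0):

     Mat C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
     ... change numerical values in A, keeping its nonzero pattern ...
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);   reuses the structure of C
     ierr = MatDestroy(C);CHKERRQ(ierr);
*/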
PetscErrorCode MatPtAP(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscErrorCode (*fA)(Mat,Mat,MatReuse,PetscReal,Mat *);
  PetscErrorCode (*fP)(Mat,Mat,MatReuse,PetscReal,Mat *);

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_COOKIE,1);
  PetscValidType(A,1);
  MatPreallocated(A);
  if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  PetscValidHeaderSpecific(P,MAT_COOKIE,2);
  PetscValidType(P,2);
  MatPreallocated(P);
  if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  PetscValidPointer(C,3);

  if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);

  if (fill <= 0.0) SETERRQ1(PETSC_ERR_ARG_SIZ,"fill=%g must be > 0.0",fill);

  /* For now, we do not dispatch based on the type of A and P */
  /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
  fA = A->ops->ptap;
  if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for A of type %s",A->type_name);
  fP = P->ops->ptap;
  if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for P of type %s",P->type_name);
  if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAP requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);

  ierr = PetscLogEventBegin(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
  ierr = (*fA)(A,P,scall,fill,C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,PetscReal,Mat*);
EXTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,Mat);

EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);

    ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
    /* free ptap only if it was actually retrieved from the container above */
    ierr = PetscFree(ptap);CHKERRQ(ierr);
  }

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);

    /* attach the supporting struct to P for reuse */
    P->ops->destroy = MatDestroy_MPIAIJ_PtAP;
    ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
    ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);

    /* now, compute symbolic local P^T*A*P */
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  } else if (scall == MAT_REUSE_MATRIX){
    ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
    if (container) {
      ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    } else {
      SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
    }

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);

  } else {
    SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
  }
  /* now, compute numeric local P^T*A*P */
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat                  P_loc,P_oth;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;
  FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ           *p_loc,*p_oth;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
  PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
  PetscInt             aN=A->N,am=A->m,pN=P->N;
  PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
  PetscBT              lnkbt;
  PetscInt             prstart,prend,nprow_loc,nprow_oth;
  PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;

  MPI_Comm             comm=A->comm;
  Mat                  B_mpi;
  PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
  PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt             len,proc,*dnz,*onz,*owners;
  PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
  PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
  MPI_Status           *status;
  Mat_Merge_SeqsToMPI  *merge;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }
  /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
  /*--------------------------------------------------*/
  /* get data from P_loc and P_oth */
  P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
  p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
  prend = prstart + pm;

  /* get ij structure of P_loc^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
  ptJ=ptj;

  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow_loc = ptadenserow_loc + aN;
  ptadenserow_oth  = ptasparserow_loc + aN;
  ptasparserow_oth = ptadenserow_oth + aN;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
  /* This should be reasonable if sparsity of PtAP is similar to that of A. */
  nnz           = adi[am] + aoi[am];
  ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic info for each row of C: */
  for (i=0;i<pN;i++) {
    ptnzi     = pti[i+1] - pti[i];
    nprow_loc = 0; nprow_oth = 0;
    /* i-th row of symbolic P_loc^T*A_loc: */
    for (j=0;j<ptnzi;j++) {
      arow = *ptJ++;
      /* diagonal portion of A */
      anzj = adi[arow+1] - adi[arow];
      ajj  = adj + adi[arow];
      for (k=0;k<anzj;k++) {
        col = ajj[k]+prstart;
        if (!ptadenserow_loc[col]) {
          ptadenserow_loc[col]          = -1;
          ptasparserow_loc[nprow_loc++] = col;
        }
      }
      /* off-diagonal portion of A */
      anzj = aoi[arow+1] - aoi[arow];
      ajj  = aoj + aoi[arow];
      for (k=0;k<anzj;k++) {
        col = a->garray[ajj[k]];  /* global col */
        if (col < cstart){
          col = ajj[k];
        } else if (col >= cend){
          col = ajj[k] + pm;
        } else {
          SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
        }
        if (!ptadenserow_oth[col]) {
          ptadenserow_oth[col]          = -1;
          ptasparserow_oth[nprow_oth++] = col;
        }
      }
    }

    /* using symbolic info of local PtA, determine symbolic info for row of C: */
    cnzi = 0;
    /* rows of P_loc */
    ptaj = ptasparserow_loc;
    for (j=0; j<nprow_loc; j++) {
      prow = *ptaj++;
      prow -= prstart; /* rm */
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }
    /* rows of P_oth */
    ptaj = ptasparserow_oth;
    for (j=0; j<nprow_oth; j++) {
      prow = *ptaj++;
      if (prow >= prend) prow -= pm; /* rm */
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }

    /* If free space is not available, make more free space */
    /* Double the amount of total space in the list */
    if (current_space->local_remaining<cnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, and zero out denserows */
    ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += cnzi;
    current_space->local_used      += cnzi;
    current_space->local_remaining -= cnzi;

    for (j=0;j<nprow_loc; j++) {
      ptadenserow_loc[ptasparserow_loc[j]] = 0;
    }
    for (j=0;j<nprow_oth; j++) {
      ptadenserow_oth[ptasparserow_oth[j]] = 0;
    }

    /* Aside: Perhaps we should save the pta info for the numerical factorization. */
    /*        For now, we will recompute what is needed. */
    ci[i+1] = ci[i] + cnzi;
  }
  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);

  /* nnz is now stored in ci[pN], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Allocate space for ca */
  /*
  ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
  ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
  */

  /* add C_seq into mpi C              */
  /*-----------------------------------*/
  free_space=PETSC_NULL; current_space=PETSC_NULL;

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);

  /* determine row ownership */
  /*---------------------------------------------------------*/
  ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
  ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* determine the number of messages to send, their lengths */
  /*---------------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  len   = 0;  /* length of buf_si[] */
  merge->nsend = 0;
  for (proc=0; proc<size; proc++){
    len_si[proc] = 0;
    if (proc == rank){
      len_s[proc] = 0;
    } else {
      len_si[proc] = owners[proc+1] - owners[proc] + 1;
      len_s[proc]  = ci[owners[proc+1]] - ci[owners[proc]]; /* num of nonzeros to be sent to [proc] */
    }
    if (len_s[proc]) {
      merge->nsend++;
      nrows = 0;
      for (i=owners[proc]; i<owners[proc+1]; i++){
        if (ci[i+1] > ci[i]) nrows++;
      }
      len_si[proc] = 2*(nrows+1);
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for ij-structure */
  /*-------------------------------------------------------------------------*/
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv of j-structure */
  /*-------------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);

  /* post the Isend of j-structure */
  /*--------------------------------*/
  ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
  sj_waits = si_waits + merge->nsend;

  for (proc=0, k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of j-structure are complete */
  /*------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);

  /* send and recv i-structure */
  /*---------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (local to the receiving processor)
               [nrows+1:2*nrows+1]: i-structure index
    */
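    /* e.g. two nonempty rows with local indices 2 and 5, carrying 3 and 4
       nonzeros respectively, give buf_si = {2, 2,5, 0,3,7} and
       len_si[proc] = 2*(nrows+1) = 6 */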
    /*-------------------------------------------*/
    nrows = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows = 0;
    for (i=owners[proc]; i<owners[proc+1]; i++){
      anzi = ci[i+1] - ci[i];
      if (anzi) {
        buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */
        buf_si[nrows+1] = i-owners[proc]; /* local row index */
        nrows++;
      }
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }

  ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);

  ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++){
    ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(rj_waits);CHKERRQ(ierr);
  ierr = PetscFree(si_waits);CHKERRQ(ierr);
  ierr = PetscFree(ri_waits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  /* compute a local seq matrix in each processor */
  /*----------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
  len  = ci[owners[rank+1]] - ci[owners[rank]];
  ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic info for each local row */
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow = buf_ri_k + merge->nrecv;
  nextci  = nextrow + merge->nrecv;
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  len = 0;
  for (i=0; i<pn; i++) {
    bnzi = 0;
    /* add local non-zero cols of this proc's C_seq into lnk */
    arow  = owners[rank] + i;
    anzi  = ci[arow+1] - ci[arow];
    cji   = cj + ci[arow];
    ierr  = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
    bnzi += nlnk;
    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        anzi = *(nextci[k]+1) - *nextci[k];
        cji  = buf_rj[k] + *nextci[k];
        ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
        bnzi += nlnk;
        nextrow[k]++; nextci[k]++;
      }
    }
    if (len < bnzi) len = bnzi;  /* =max(bnzi) */

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<bnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += bnzi;
    current_space->local_used      += bnzi;
    current_space->local_remaining -= bnzi;

    bi[i+1] = bi[i] + bnzi;
  }

  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* create symbolic parallel matrix B_mpi */
  /*---------------------------------------*/
  ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
  ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetOption(B_mpi,MAT_COLUMNS_SORTED);CHKERRQ(ierr);

  /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
  B_mpi->assembled    = PETSC_FALSE;
  B_mpi->ops->destroy = MatDestroy_MPIAIJ_SeqsToMPI;
  merge->bi           = bi;
  merge->bj           = bj;
  merge->ci           = ci;
  merge->cj           = cj;
  merge->buf_ri       = buf_ri;
  merge->buf_rj       = buf_rj;
  merge->C_seq        = PETSC_NULL;

  /* attach the supporting struct to B_mpi for reuse */
  ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
  ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
  *C = B_mpi;

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode       ierr;
  Mat_Merge_SeqsToMPI  *merge;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer cont_merge,cont_ptap;

  PetscInt       flops=0;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
  PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  PetscInt       *cjj;
  MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj;
  MatScalar      *pa_loc,*pA,*pa_oth;
  PetscInt       am=A->m,cN=C->N;

  MPI_Comm             comm=C->comm;
  PetscMPIInt          size,rank,taga,*len_s;
  PetscInt             *owners;
  PetscInt             proc;
  PetscInt             **buf_ri,**buf_rj;
  PetscInt             cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
  PetscInt             nrows,**buf_ri_k,**nextrow,**nextcseqi;
  MPI_Request          *s_waits,*r_waits;
  MPI_Status           *status;
  MatScalar            **abuf_r,*ba_i;
  PetscInt             *cseqi,*cseqj;
  PetscInt             *cseqj_tmp;
  MatScalar            *cseqa_tmp;
  PetscInt             stages[3];

  PetscFunctionBegin;
  ierr = PetscLogStageRegister(&stages[0],"NumAP_local");CHKERRQ(ierr);
  ierr = PetscLogStageRegister(&stages[1],"NumPtAP_local");CHKERRQ(ierr);
  ierr = PetscLogStageRegister(&stages[2],"NumPtAP_Comm");CHKERRQ(ierr);

  ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
  if (cont_merge) {
    ierr = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix C does not possess an object container");
  }
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
  if (cont_ptap) {
    ierr = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }
  /* get data from symbolic products */
  p_loc=(Mat_SeqAIJ*)(ptap->B_loc)->data;
  p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  cseqi = merge->ci; cseqj = merge->cj;
  ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
  ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);

  /* Allocate temporary array for storage of one row of A*P */
  ierr = PetscMalloc(cN*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,cN*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
  apj      = (PetscInt *)(apa + cN);
  apjdense = apj + cN;
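  /* apa is a single allocation split into three segments:
       apa[0..cN-1]      - dense scalar workspace for the current row of A*P
       apj[0..cN-1]      - compressed list of the columns actually touched
       apjdense[0..cN-1] - flags marking which columns are already in apj */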

  for (i=0;i<am;i++) {
    ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
    /* Form i-th sparse row of A*P */
    apnzj = 0;
    /* diagonal portion of A */
    anzi  = adi[i+1] - adi[i];
    for (j=0;j<anzi;j++) {
      prow = *adj;
      adj++;
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      paj  = pa_loc + pi_loc[prow];
      for (k=0;k<pnzj;k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*ada)*paj[k];
      }
      flops += 2*pnzj;
      ada++;
    }
    /* off-diagonal portion of A; its local column *aoj is also the row of P_oth */
    anzi  = aoi[i+1] - aoi[i];
    for (j=0;j<anzi;j++) {
      col = a->garray[*aoj];  /* global column; must lie outside the diagonal block */
      if (col >= cstart && col < cend) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
      prow = *aoj;
      aoj++;
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      paj  = pa_oth + pi_oth[prow];
      for (k=0;k<pnzj;k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*aoa)*paj[k];
      }
      flops += 2*pnzj;
      aoa++;
    }
    /* Sort the j index array for quick sparse axpy. */
    ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
    ierr = PetscLogStagePop();CHKERRQ(ierr);

    /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
    ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
    pnzi = pi_loc[i+1] - pi_loc[i];
    for (j=0;j<pnzi;j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cseqj + cseqi[crow];
      caj    = cseqa + cseqi[crow];
      /* Perform sparse axpy operation.  Note cjj includes apj. */
      for (k=0;nextap<apnzj;k++) {
        if (cjj[k]==apj[nextap]) {
          caj[k] += (*pA)*apa[apj[nextap++]];
        }
      }
      flops += 2*apnzj;
      pA++;
    }

    /* Zero the current row info for A*P */
    for (j=0;j<apnzj;j++) {
      apa[apj[j]]      = 0.;
      apjdense[apj[j]] = 0;
    }
    ierr = PetscLogStagePop();CHKERRQ(ierr);
  }

  ierr = PetscFree(apa);CHKERRQ(ierr);

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  bi     = merge->bi;
  bj     = merge->bj;
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;

  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* send and recv matrix values */
  /*-----------------------------*/
  ierr = PetscLogStagePush(stages[2]);CHKERRQ(ierr);
  len_s = merge->len_s;
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
  ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }

  ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  ierr = PetscFree(s_waits);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);

  /* insert mat values of mpimat */
  /*----------------------------*/
  ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr);
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow   = buf_ri_k + merge->nrecv;
  nextcseqi = nextrow + merge->nrecv;

  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k]  = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows        = *(buf_ri_k[k]);
    nextrow[k]   = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
    nextcseqi[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  /* set values of ba */
  for (i=0; i<C->m; i++) {
    cseqrow = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj+bi[i];  /* col indices of the i-th row of C */
    bnzi = bi[i+1] - bi[i];
    ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);

    /* add local non-zero vals of this proc's C_seq into ba */
    cseqnzi   = cseqi[cseqrow+1] - cseqi[cseqrow];
    cseqj_tmp = cseqj + cseqi[cseqrow];
    cseqa_tmp = cseqa + cseqi[cseqrow];
    nextcseqj = 0;
    for (j=0; nextcseqj<cseqnzi; j++){
      if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
        ba_i[j] += cseqa_tmp[nextcseqj++];
      }
    }

    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cseqnzi   = *(nextcseqi[k]+1) - *nextcseqi[k];
        cseqj_tmp = buf_rj[k] + *(nextcseqi[k]);
        cseqa_tmp = abuf_r[k] + *(nextcseqi[k]);
        nextcseqj = 0;
        for (j=0; nextcseqj<cseqnzi; j++){
          if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
            ba_i[j] += cseqa_tmp[nextcseqj++];
          }
        }
        nextrow[k]++; nextcseqi[k]++;
      }
    }
    ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
    flops += 2*bnzi;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);

  ierr = PetscFree(cseqa);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree(ba_i);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_SeqAIJ_SeqAIJ"
PetscErrorCode MatPtAP_SeqAIJ_SeqAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    ierr = MatPtAPSymbolic_SeqAIJ_SeqAIJ(A,P,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatPtAPSymbolic - Creates the (i,j) structure of the matrix projection C = P^T * A * P

   Collective on Mat

   Input Parameters:
+  A - the matrix
-  P - the projection matrix

   Output Parameters:
.  C - the (i,j) structure of the product matrix

   Notes:
   C will be created and must be destroyed by the user with MatDestroy().

   This routine is currently only implemented for pairs of SeqAIJ matrices and classes
   which inherit from SeqAIJ.  C will be of type MATSEQAIJ.  The product is computed using
   this (i,j) structure by calling MatPtAPNumeric().

   Level: intermediate

.seealso: MatPtAP(),MatPtAPNumeric(),MatMatMultSymbolic()
*/
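/* A minimal sketch of the split interface (assuming A and P are assembled
   SeqAIJ matrices):

     Mat C;
     ierr = MatPtAPSymbolic(A,P,1.0,&C);CHKERRQ(ierr);   computes only the structure of C
     ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);         fills in the numerical values
     ierr = MatDestroy(C);CHKERRQ(ierr);
*/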
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic"
PetscErrorCode MatPtAPSymbolic(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscErrorCode (*fA)(Mat,Mat,PetscReal,Mat*);
  PetscErrorCode (*fP)(Mat,Mat,PetscReal,Mat*);

  PetscFunctionBegin;

  PetscValidHeaderSpecific(A,MAT_COOKIE,1);
  PetscValidType(A,1);
  MatPreallocated(A);
  if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidHeaderSpecific(P,MAT_COOKIE,2);
  PetscValidType(P,2);
  MatPreallocated(P);
  if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidPointer(C,3);

  if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
  if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);

  /* For now, we do not dispatch based on the type of A and P */
  /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
  fA = A->ops->ptapsymbolic;
  if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for A of type %s",A->type_name);
  fP = P->ops->ptapsymbolic;
  if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for P of type %s",P->type_name);
  if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPSymbolic requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);

  ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  ierr = (*fA)(A,P,fill,C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

typedef struct {
  Mat symAP;
} Mat_PtAPstruct;

EXTERN PetscErrorCode MatDestroy_SeqAIJ(Mat);

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJ_PtAP"
PetscErrorCode MatDestroy_SeqAIJ_PtAP(Mat A)
{
  PetscErrorCode ierr;
  Mat_PtAPstruct *ptap=(Mat_PtAPstruct*)A->spptr;

  PetscFunctionBegin;
  ierr = MatDestroy(ptap->symAP);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqAIJ"
PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*p = (Mat_SeqAIJ*)P->data,*c;
  PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
  PetscInt       *ci,*cj,*ptadenserow,*ptasparserow,*ptaj;
  PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M;
  PetscInt       i,j,k,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,nlnk,*lnk;
  MatScalar      *ca;
  PetscBT        lnkbt;

  PetscFunctionBegin;
  /* Get ij structure of P^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
  ptJ=ptj;

  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow,(2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow = ptadenserow + an;

  /* create and initialize a linked list */
  nlnk = pn+1;
  ierr = PetscLLCreate(pn,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
  /* This should be reasonable if sparsity of PtAP is similar to that of A. */
  ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* Determine symbolic info for each row of C: */
  for (i=0;i<pn;i++) {
    ptnzi  = pti[i+1] - pti[i];
    ptanzi = 0;
    /* Determine symbolic row of PtA: */
    for (j=0;j<ptnzi;j++) {
      arow = *ptJ++;
      anzj = ai[arow+1] - ai[arow];
      ajj  = aj + ai[arow];
      for (k=0;k<anzj;k++) {
        if (!ptadenserow[ajj[k]]) {
          ptadenserow[ajj[k]]    = -1;
          ptasparserow[ptanzi++] = ajj[k];
        }
      }
    }
    /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
    ptaj = ptasparserow;
    cnzi = 0;
    for (j=0;j<ptanzi;j++) {
      prow = *ptaj++;
      pnzj = pi[prow+1] - pi[prow];
      pjj  = pj + pi[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }

    /* If free space is not available, make more free space */
    /* Double the amount of total space in the list */
    if (current_space->local_remaining<cnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, and zero out denserows */
    ierr = PetscLLClean(pn,pn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += cnzi;
    current_space->local_used      += cnzi;
    current_space->local_remaining -= cnzi;

    for (j=0;j<ptanzi;j++) {
      ptadenserow[ptasparserow[j]] = 0;
    }
    /* Aside: Perhaps we should save the pta info for the numerical factorization. */
    /*        For now, we will recompute what is needed. */
    ci[i+1] = ci[i] + cnzi;
  }
  /* nnz is now stored in ci[pn], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Allocate space for ca */
  ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
  ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);

  /* put together the new matrix */
  ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);

  /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
  /* Since these are PETSc arrays, change flags to free them as necessary. */
  c = (Mat_SeqAIJ *)((*C)->data);
  c->freedata = PETSC_TRUE;
  c->nonew    = 0;

  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#include "src/mat/impls/maij/maij.h"
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqMAIJ"
PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqMAIJ(Mat A,Mat PP,Mat *C)
{
  /* This routine requires testing -- I don't think it works. */
  PetscErrorCode ierr;
  FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_SeqMAIJ    *pp=(Mat_SeqMAIJ*)PP->data;
  Mat            P=pp->AIJ;
  Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*p=(Mat_SeqAIJ*)P->data,*c;
  PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
  PetscInt       *ci,*cj,*denserow,*sparserow,*ptadenserow,*ptasparserow,*ptaj;
  PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M,ppdof=pp->dof,cn=pn*ppdof;
  PetscInt       i,j,k,dof,pdof,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,crow;
  MatScalar      *ca;

  PetscFunctionBegin;
  /* Start timer */
  ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);

  /* Get ij structure of P^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);

  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info; cn is the column */
  /* count of the dof-expanded P, hence the dimension of C */
  ierr = PetscMalloc((cn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((2*cn+2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow,(2*cn+2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow = ptadenserow  + an;
  denserow     = ptasparserow + an;
  sparserow    = denserow     + cn;

  /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
  /* This should be reasonable if sparsity of PtAP is similar to that of A. */
  ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* Determine symbolic info for each row of C, expanding each scalar row */
  /* of P by the number of degrees of freedom: */
  for (i=0;i<pn;i++) {
    ptnzi = pti[i+1] - pti[i];
    ptJ   = ptj + pti[i];
    for (dof=0;dof<ppdof;dof++) {
      ptanzi = 0;
      /* Determine symbolic row of PtA; the expanded row of A is ptJ[j]*ppdof+dof */
      for (j=0;j<ptnzi;j++) {
        arow = ptJ[j]*ppdof + dof;
        anzj = ai[arow+1] - ai[arow];
        ajj  = aj + ai[arow];
        for (k=0;k<anzj;k++) {
          if (!ptadenserow[ajj[k]]) {
            ptadenserow[ajj[k]]    = -1;
            ptasparserow[ptanzi++] = ajj[k];
          }
        }
      }
      /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
      ptaj = ptasparserow;
      cnzi = 0;
      for (j=0;j<ptanzi;j++) {
        /* decompose the expanded column of A into a scalar row of P and a dof */
        pdof = *ptaj%ppdof;
        prow = (*ptaj++)/ppdof;
        pnzj = pi[prow+1] - pi[prow];
        pjj  = pj + pi[prow];
        for (k=0;k<pnzj;k++) {
          if (!denserow[pjj[k]*ppdof+pdof]) {
            denserow[pjj[k]*ppdof+pdof] = -1;
            sparserow[cnzi++]           = pjj[k]*ppdof+pdof;
          }
        }
      }

      /* sort sparserow */
      ierr = PetscSortInt(cnzi,sparserow);CHKERRQ(ierr);

      /* If free space is not available, make more free space */
      /* Double the amount of total space in the list */
      if (current_space->local_remaining<cnzi) {
        ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      }

      /* Copy data into free space, and zero out denserows */
      ierr = PetscMemcpy(current_space->array,sparserow,cnzi*sizeof(PetscInt));CHKERRQ(ierr);
      current_space->array           += cnzi;
      current_space->local_used      += cnzi;
      current_space->local_remaining -= cnzi;

      for (j=0;j<ptanzi;j++) {
        ptadenserow[ptasparserow[j]] = 0;
      }
      for (j=0;j<cnzi;j++) {
        denserow[sparserow[j]] = 0;
      }
      /* Aside: Perhaps we should save the pta info for the numerical factorization. */
      /*        For now, we will recompute what is needed. */
      crow       = i*ppdof + dof; /* expanded row of C */
      ci[crow+1] = ci[crow] + cnzi;
    }
  }
  /* nnz is now stored in ci[cn], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[cn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow);CHKERRQ(ierr);

  /* Allocate space for ca */
  ierr = PetscMalloc((ci[cn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
  ierr = PetscMemzero(ca,(ci[cn]+1)*sizeof(MatScalar));CHKERRQ(ierr);

  /* put together the new matrix */
  ierr = MatCreateSeqAIJWithArrays(A->comm,cn,cn,ci,cj,ca,C);CHKERRQ(ierr);

  /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
  /* Since these are PETSc arrays, change flags to free them as necessary. */
  c = (Mat_SeqAIJ *)((*C)->data);
  c->freedata = PETSC_TRUE;
  c->nonew    = 0;

  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);

  ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END

/*
   MatPtAPNumeric - Computes the matrix projection C = P^T * A * P

   Collective on Mat

   Input Parameters:
+  A - the matrix
-  P - the projection matrix

   Output Parameters:
.  C - the product matrix

   Notes:
   C must have been created by calling MatPtAPSymbolic() and must be destroyed by
   the user using MatDestroy().

   This routine is currently only implemented for pairs of AIJ matrices and classes
   which inherit from AIJ.  C will be of type MATAIJ.

   Level: intermediate

.seealso: MatPtAP(),MatPtAPSymbolic(),MatMatMultNumeric()
*/
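/* A usage sketch: once the structure of C exists, the numeric phase can be
   repeated as the entries of A change (assuming the nonzero patterns of A
   and P stay fixed):

     ierr = MatPtAPSymbolic(A,P,1.0,&C);CHKERRQ(ierr);
     for (step=0; step<nsteps; step++) {
       ... update the numerical values of A ...
       ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);
     }
*/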
#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric"
PetscErrorCode MatPtAPNumeric(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  PetscErrorCode (*fA)(Mat,Mat,Mat);
  PetscErrorCode (*fP)(Mat,Mat,Mat);

  PetscFunctionBegin;

  PetscValidHeaderSpecific(A,MAT_COOKIE,1);
  PetscValidType(A,1);
  MatPreallocated(A);
  if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidHeaderSpecific(P,MAT_COOKIE,2);
  PetscValidType(P,2);
  MatPreallocated(P);
  if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidHeaderSpecific(C,MAT_COOKIE,3);
  PetscValidType(C,3);
  MatPreallocated(C);
  if (!C->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (C->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  if (P->N!=C->M) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->M);
  if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
  if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
  if (P->N!=C->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->N);

  /* For now, we do not dispatch based on the type of A and P */
  /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
  fA = A->ops->ptapnumeric;
  if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for A of type %s",A->type_name);
  fP = P->ops->ptapnumeric;
  if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for P of type %s",P->type_name);
  if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPNumeric requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);

  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = (*fA)(A,P,C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_SeqAIJ_SeqAIJ"
PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  PetscInt       flops=0;
  Mat_SeqAIJ     *a  = (Mat_SeqAIJ *) A->data;
  Mat_SeqAIJ     *p  = (Mat_SeqAIJ *) P->data;
  Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
  PetscInt       *ai=a->i,*aj=a->j,*apj,*apjdense,*pi=p->i,*pj=p->j,*pJ=p->j,*pjj;
  PetscInt       *ci=c->i,*cj=c->j,*cjj;
  PetscInt       am=A->M,cn=C->N,cm=C->M;
  PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  MatScalar      *aa=a->a,*apa,*pa=p->a,*pA=p->a,*paj,*ca=c->a,*caj;

  PetscFunctionBegin;
  /* Allocate temporary array for storage of one row of A*P */
  ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);

  apj      = (PetscInt *)(apa + cn);
  apjdense = apj + cn;

  /* Clear old values in C */
  ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0;i<am;i++) {
    /* Form sparse row of A*P */
    anzi  = ai[i+1] - ai[i];
    apnzj = 0;
    for (j=0;j<anzi;j++) {
      prow = *aj++;
      pnzj = pi[prow+1] - pi[prow];
      pjj  = pj + pi[prow];
      paj  = pa + pi[prow];
      for (k=0;k<pnzj;k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*aa)*paj[k];
      }
      flops += 2*pnzj;
      aa++;
    }

    /* Sort the j index array for quick sparse axpy. */
    ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);

    /* Compute P^T*A*P using outer product (P^T)[:,j]*(A*P)[j,:]. */
    pnzi = pi[i+1] - pi[i];
    for (j=0;j<pnzi;j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cj + ci[crow];
      caj    = ca + ci[crow];
      /* Perform sparse axpy operation.  Note cjj includes apj. */
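      /* both cjj and apj are sorted, and every column in apj also appears in
         cjj, so a single forward scan over cjj matches each apj entry in turn */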
      for (k=0;nextap<apnzj;k++) {
        if (cjj[k]==apj[nextap]) {
          caj[k] += (*pA)*apa[apj[nextap++]];
        }
      }
      flops += 2*apnzj;
      pA++;
    }

    /* Zero the current row info for A*P */
    for (j=0;j<apnzj;j++) {
      apa[apj[j]]      = 0.;
      apjdense[apj[j]] = 0;
    }
  }

  /* Assemble the final matrix and clean up */
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(apa);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
1292 
1293 /* Compute local C = P_loc^T * A * P - used by MatPtAP_MPIAIJ_MPIAIJ() */
1294 static PetscEvent logkey_matptapnumeric_local = 0;
1295 #undef __FUNCT__
1296 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_Local"
1297 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,Mat C)
1298 {
1299   PetscErrorCode ierr;
1300   PetscInt       flops=0;
1301   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
1302   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1303   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
1304   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
1305   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
1306   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
1307   PetscInt       *pJ=pj_loc,*pjj;
1308   PetscInt       *ci=c->i,*cj=c->j,*cjj;
1309   PetscInt       am=A->m,cn=C->N,cm=C->M;
1310   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1311   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*ca=c->a,*caj;
1312   MatScalar      *pa_loc=p_loc->a,*pA=pa_loc,*pa_oth=p_oth->a;
1313 
1314   PetscFunctionBegin;
1315   if (!logkey_matptapnumeric_local) {
1316     ierr = PetscLogEventRegister(&logkey_matptapnumeric_local,"MatPtAPNumeric_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
1317   }
1318   ierr = PetscLogEventBegin(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
1319 
1320   /* Allocate temporary array for storage of one row of A*P */
1321   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1322   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1323   apj      = (PetscInt *)(apa + cn);
1324   apjdense = apj + cn;
1325 
1326   /* Clear old values in C */
1327   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1328 
1329   for (i=0;i<am;i++) {
1330     /* Form i-th sparse row of A*P */
1331      apnzj = 0;
1332     /* diagonal portion of A */
1333     anzi  = adi[i+1] - adi[i];
1334     for (j=0;j<anzi;j++) {
1335       prow = *adj;
1336       adj++;
1337       pnzj = pi_loc[prow+1] - pi_loc[prow];
1338       pjj  = pj_loc + pi_loc[prow];
1339       paj  = pa_loc + pi_loc[prow];
1340       for (k=0;k<pnzj;k++) {
1341         if (!apjdense[pjj[k]]) {
1342           apjdense[pjj[k]] = -1;
1343           apj[apnzj++]     = pjj[k];
1344         }
1345         apa[pjj[k]] += (*ada)*paj[k];
1346       }
1347       flops += 2*pnzj;
1348       ada++;
1349     }
1350     /* off-diagonal portion of A */
1351     anzi  = aoi[i+1] - aoi[i];
1352     for (j=0;j<anzi;j++) {
1353       col = a->garray[*aoj];
1354       if (col < cstart){
1355         prow = *aoj;
1356       } else if (col >= cend){
1357         prow = *aoj;
1358       } else {
1359         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
1360       }
1361       aoj++;
1362       pnzj = pi_oth[prow+1] - pi_oth[prow];
1363       pjj  = pj_oth + pi_oth[prow];
1364       paj  = pa_oth + pi_oth[prow];
1365       for (k=0;k<pnzj;k++) {
1366         if (!apjdense[pjj[k]]) {
1367           apjdense[pjj[k]] = -1;
1368           apj[apnzj++]     = pjj[k];
1369         }
1370         apa[pjj[k]] += (*aoa)*paj[k];
1371       }
1372       flops += 2*pnzj;
1373       aoa++;
1374     }
1375     /* Sort the j index array so the sparse axpy below can merge it against the sorted column indices of C. */
1376     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1377 
1378     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
1379     pnzi = pi_loc[i+1] - pi_loc[i];
1380     for (j=0;j<pnzi;j++) {
1381       nextap = 0;
1382       crow   = *pJ++;
1383       cjj    = cj + ci[crow];
1384       caj    = ca + ci[crow];
1385       /* Sparse axpy: cjj, the symbolic pattern of row crow of C, is a superset of apj, so one merge pass over cjj suffices. */
1386       for (k=0;nextap<apnzj;k++) {
1387         if (cjj[k]==apj[nextap]) {
1388           caj[k] += (*pA)*apa[apj[nextap++]];
1389         }
1390       }
1391       flops += 2*apnzj;
1392       pA++;
1393     }
1394 
1395     /* Zero the current row info for A*P */
1396     for (j=0;j<apnzj;j++) {
1397       apa[apj[j]]      = 0.;
1398       apjdense[apj[j]] = 0;
1399     }
1400   }
1401 
1402   /* Assemble the final matrix and clean up */
1403   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1404   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1405   ierr = PetscFree(apa);CHKERRQ(ierr);
1406   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1407   ierr = PetscLogEventEnd(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
1408   PetscFunctionReturn(0);
1409 }
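     /*
        Note: the numeric routine above relies on the pattern of C having been
        created by the symbolic routine below; its sparse axpy assumes every
        row of C already contains the pattern of the corresponding row of A*P.
     */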
1410 static PetscEvent logkey_matptapsymbolic_local = 0;
1411 #undef __FUNCT__
1412 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local"
1413 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,PetscReal fill,Mat *C)
1414 {
1415   PetscErrorCode ierr;
1416   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1417   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
1418   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
1419   PetscInt       *pti,*ptj,*ptJ,*ajj,*pjj;
1420   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
1421   PetscInt       *ci,*cj,*ptaj;
1422   PetscInt       aN=A->N,am=A->m,pN=P_loc->N;
1423   PetscInt       i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
1424   PetscInt       pm=P_loc->m,nlnk,*lnk;
1425   MatScalar      *ca;
1426   PetscBT        lnkbt;
1427   PetscInt       prend,nprow_loc,nprow_oth;
1428   PetscInt       *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
1429   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
1430   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
1431 
1432   PetscFunctionBegin;
1433   if (!logkey_matptapsymbolic_local) {
1434     ierr = PetscLogEventRegister(&logkey_matptapsymbolic_local,"MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
1435   }
1436   ierr = PetscLogEventBegin(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
1437 
1438   prend = prstart + pm;
1439 
1440   /* get ij structure of P_loc^T */
1441   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1442   ptJ=ptj;
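       /*
          pti/ptj hold the compressed-row pattern of P_loc^T, so row i of C
          corresponds to column i of P_loc; the entries that ptJ walks are the
          local rows of A (equivalently, of P_loc) contributing to row i of C.
       */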
1443 
1444   /* Allocate ci array, arrays for fill computation and */
1445   /* free space for accumulating nonzero column info */
1446   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1447   ci[0] = 0;
1448 
1449   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
1450   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
1451   ptasparserow_loc = ptadenserow_loc + aN;
1452   ptadenserow_oth  = ptasparserow_loc + aN;
1453   ptasparserow_oth = ptadenserow_oth + aN;
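       /*
          The single (4*aN+1)-entry allocation above is carved into four
          length-aN work arrays: a dense-flag/sparse-index pair for columns
          reached through the diagonal block of A (ptadenserow_loc and
          ptasparserow_loc), and a second pair for columns reached through
          the off-diagonal block (ptadenserow_oth and ptasparserow_oth).
       */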
1454 
1455   /* create and initialize a linked list */
1456   nlnk = pN+1;
1457   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
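       /*
          lnk is a sorted linked list of column indices backed by the PetscBT
          bit table lnkbt for O(1) membership tests: PetscLLAdd below merges
          each contributing row of P into the list, and PetscLLClean drains
          the sorted result into the free-space array and resets the list.
       */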
1458 
1459   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
1460   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1461   nnz           = adi[am] + aoi[am];
1462   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
1463   current_space = free_space;
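       /* Illustrative numbers only: with fill = 1.0, nnz = adi[am]+aoi[am] = 1000,
          pN = 50, and aN = 100, the initial guess is 1.0*1000*50/100 + 1 = 501
          entries, i.e. nnz(A) rescaled from aN to pN columns.                  */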
1464 
1465   /* determine symbolic info for each row of C: */
1466   for (i=0;i<pN;i++) {
1467     ptnzi  = pti[i+1] - pti[i];
1468     nprow_loc = 0; nprow_oth = 0;
1469     /* i-th row of symbolic P_loc^T*A (diagonal and off-diagonal blocks): */
1470     for (j=0;j<ptnzi;j++) {
1471       arow = *ptJ++;
1472       /* diagonal portion of A */
1473       anzj = adi[arow+1] - adi[arow];
1474       ajj  = adj + adi[arow];
1475       for (k=0;k<anzj;k++) {
1476         col = ajj[k]+prstart;
1477         if (!ptadenserow_loc[col]) {
1478           ptadenserow_loc[col]    = -1;
1479           ptasparserow_loc[nprow_loc++] = col;
1480         }
1481       }
1482       /* off-diagonal portion of A */
1483       anzj = aoi[arow+1] - aoi[arow];
1484       ajj  = aoj + aoi[arow];
1485       for (k=0;k<anzj;k++) {
1486         col = a->garray[ajj[k]];  /* global col */
1487         if (col < cstart){
1488           col = ajj[k];
1489         } else if (col >= cend){
1490           col = ajj[k] + pm;
1491         } else {
1492           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
1493         }
1494         if (!ptadenserow_oth[col]) {
1495           ptadenserow_oth[col]    = -1;
1496           ptasparserow_oth[nprow_oth++] = col;
1497         }
1498       }
1499     }
1500 
1501     /* using symbolic info of local PtA, determine symbolic info for row of C: */
1502     cnzi   = 0;
1503     /* rows of P_loc */
1504     ptaj = ptasparserow_loc;
1505     for (j=0; j<nprow_loc; j++) {
1506       prow = *ptaj++;
1507       prow -= prstart; /* map the stored index back to a local row of P_loc */
1508       pnzj = pi_loc[prow+1] - pi_loc[prow];
1509       pjj  = pj_loc + pi_loc[prow];
1510       /* add non-zero cols of P into the sorted linked list lnk */
1511       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1512       cnzi += nlnk;
1513     }
1514     /* rows of P_oth */
1515     ptaj = ptasparserow_oth;
1516     for (j=0; j<nprow_oth; j++) {
1517       prow = *ptaj++;
1518       if (prow >= prend) prow -= pm; /* map the stored index back to a local row of P_oth */
1519       pnzj = pi_oth[prow+1] - pi_oth[prow];
1520       pjj  = pj_oth + pi_oth[prow];
1521       /* add non-zero cols of P into the sorted linked list lnk */
1522       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1523       cnzi += nlnk;
1524     }
1525 
1526     /* If free space is not available, make more free space */
1527     /* Double the amount of total space in the list */
1528     if (current_space->local_remaining<cnzi) {
1529       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1530     }
1531 
1532     /* Copy data into free space, and zero out denserows */
1533     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1534     current_space->array           += cnzi;
1535     current_space->local_used      += cnzi;
1536     current_space->local_remaining -= cnzi;
1537 
1538     for (j=0;j<nprow_loc; j++) {
1539       ptadenserow_loc[ptasparserow_loc[j]] = 0;
1540     }
1541     for (j=0;j<nprow_oth; j++) {
1542       ptadenserow_oth[ptasparserow_oth[j]] = 0;
1543     }
1544 
1545     /* Aside: Perhaps we should save the pta info for the numeric phase. */
1546     /*        For now, we will recompute what is needed. */
1547     ci[i+1] = ci[i] + cnzi;
1548   }
1549   /* nnz is now stored in ci[pN], column indices are in the list of free space */
1550   /* Allocate space for cj, initialize cj, and */
1551   /* destroy list of free space and other temporary array(s) */
1552   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1553   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1554   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
1555   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1556 
1557   /* Allocate space for ca */
1558   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1559   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1560 
1561   /* put together the new matrix */
1562   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,pN,pN,ci,cj,ca,C);CHKERRQ(ierr);
1563 
1564   /* MatCreateSeqAIJWithArrays flags the matrix so PETSc doesn't free the user's arrays. */
1565   /* Since these are PETSc arrays, change flags to free them as necessary. */
1566   c = (Mat_SeqAIJ *)((*C)->data);
1567   c->freedata = PETSC_TRUE;
1568   c->nonew    = 0;
1569 
1570   /* Clean up. */
1571   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1572 
1573   ierr = PetscLogEventEnd(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
1574   PetscFunctionReturn(0);
1575 }
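
/*
   A minimal calling sketch for the two local routines in this file.  This is
   a sketch only: the actual driver is MatPtAP_MPIAIJ_MPIAIJ(), and it is
   assumed here that P_loc, P_oth, and prstart have already been extracted
   from the parallel P.

     Mat Cloc;
     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(A,P_loc,P_oth,prstart,fill,&Cloc);CHKERRQ(ierr);
     ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(A,P_loc,P_oth,prstart,Cloc);CHKERRQ(ierr);

   If the values of A later change while the patterns of A and P stay fixed,
   only the numeric phase needs to be repeated:

     ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(A,P_loc,P_oth,prstart,Cloc);CHKERRQ(ierr);
*/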
1576