xref: /petsc/src/mat/impls/aij/seq/matptap.c (revision 6853509dcb8dde923a8c3398271653a56500a9cf)
1 /*
2   Defines projective product routines where A is a AIJ matrix
3           C = P^T * A * P
4 */
5 
6 #include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
7 #include "src/mat/utils/freespace.h"
8 #include "src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatPtAP"
13 /*@
14    MatPtAP - Creates the matrix projection C = P^T * A * P
15 
16    Collective on Mat
17 
18    Input Parameters:
19 +  A - the matrix
20 .  P - the projection matrix
21 .  scall - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
22 -  fill - expected fill as ratio of nnz(C)/(nnz(A) + nnz(P))
23 
24    Output Parameters:
25 .  C - the product matrix
26 
27    Notes:
28    C will be created and must be destroyed by the user with MatDestroy().
29 
30    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
31    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.
32 
33    Level: intermediate
34 
35 .seealso: MatPtAPSymbolic(),MatPtAPNumeric(),MatMatMult()
36 @*/
37 PetscErrorCode MatPtAP(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
38 {
39   PetscErrorCode ierr;
40   PetscErrorCode (*fA)(Mat,Mat,MatReuse,PetscReal,Mat *);
41   PetscErrorCode (*fP)(Mat,Mat,MatReuse,PetscReal,Mat *);
42 
43   PetscFunctionBegin;
44   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
45   PetscValidType(A,1);
46   MatPreallocated(A);
47   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
48   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
49   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
50   PetscValidType(P,2);
51   MatPreallocated(P);
52   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
53   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
54   PetscValidPointer(C,3);
55 
56   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
57 
58   if (fill <=0.0) SETERRQ1(PETSC_ERR_ARG_SIZ,"fill=%g must be > 0.0",fill);
59 
60   /* For now, we do not dispatch based on the type of A and P */
61   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
62   fA = A->ops->ptap;
63   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for A of type %s",A->type_name);
64   fP = P->ops->ptap;
65   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for P of type %s",P->type_name);
66   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAP requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
67 
68   ierr = PetscLogEventBegin(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
69   ierr = (*fA)(A,P,scall,fill,C);CHKERRQ(ierr);
70   ierr = PetscLogEventEnd(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 
74 EXTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,PetscReal,Mat*);
75 EXTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,Mat C);
76 
77 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
78 #undef __FUNCT__
79 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
80 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
81 {
82   PetscErrorCode       ierr;
83   Mat_MatMatMultMPI    *ptap;
84   PetscObjectContainer container;
85 
86   PetscFunctionBegin;
87   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
88   if (container) {
89     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
90     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
91     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
92     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
93     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
94     if (ptap->abi) ierr = PetscFree(ptap->abi);CHKERRQ(ierr);
95     if (ptap->abj) ierr = PetscFree(ptap->abj);CHKERRQ(ierr);
96 
97     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
98     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
99   }
100   ierr = PetscFree(ptap);CHKERRQ(ierr);
101 
102   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
103   PetscFunctionReturn(0);
104 }
105 
106 #undef __FUNCT__
107 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
108 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
109 {
110   PetscErrorCode       ierr;
111   Mat_MatMatMultMPI    *ptap;
112   PetscObjectContainer container;
113 
114   PetscFunctionBegin;
115   if (scall == MAT_INITIAL_MATRIX){
116     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
117     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
118     ptap->abi=PETSC_NULL; ptap->abj=PETSC_NULL;
119     ptap->abnz_max = 0; /* symbolic A*P is not done yet */
120 
121     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
122     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
123 
124     /* get P_loc by taking all local rows of P */
125     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
126 
127     /* attach the supporting struct to P for reuse */
128     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
129     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
130     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
131     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
132 
133     /* now, compute symbolic local P^T*A*P */
134     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
135     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
136   } else if (scall == MAT_REUSE_MATRIX){
137     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
138     if (container) {
139       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
140     } else {
141       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container");
142     }
143 
144     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
145     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
146 
147     /* get P_loc by taking all local rows of P */
148     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
149 
150   } else {
151     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
152   }
153   /* now, compute numeric local P^T*A*P */
154   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
155   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
156   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
157 
158   PetscFunctionReturn(0);
159 }
160 
161 #undef __FUNCT__
162 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
163 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
164 {
165   PetscErrorCode       ierr;
166   Mat                  P_loc,P_oth;
167   Mat_MatMatMultMPI    *ptap;
168   PetscObjectContainer container;
169   FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
170   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
171   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
172   Mat_SeqAIJ           *p_loc,*p_oth;
173   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj;
174   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
175   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
176   PetscInt             aN=A->N,am=A->m,pN=P->N;
177   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
178   PetscBT              lnkbt;
179   PetscInt             prstart,prend,nprow_loc,nprow_oth;
180   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
181 
182   MPI_Comm             comm=A->comm;
183   Mat                  B_mpi;
184   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
185   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
186   PetscInt             len,proc,*dnz,*onz,*owners;
187   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
188   PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
189   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
190   MPI_Status           *status;
191   Mat_Merge_SeqsToMPI  *merge;
192 
193   PetscFunctionBegin;
194   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
195   if (container) {
196     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
197   } else {
198     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container");
199   }
200   /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
201   /*--------------------------------------------------*/
202   /* get data from P_loc and P_oth */
203   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
204   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
205   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
206   prend = prstart + pm;
207 
208   /* get ij structure of P_loc^T */
209   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
210   ptJ=ptj;
211 
212   /* Allocate ci array, arrays for fill computation and */
213   /* free space for accumulating nonzero column info */
214   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
215   ci[0] = 0;
216 
217   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
218   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
219   ptasparserow_loc = ptadenserow_loc + aN;
220   ptadenserow_oth  = ptasparserow_loc + aN;
221   ptasparserow_oth = ptadenserow_oth + aN;
222 
223   /* create and initialize a linked list */
224   nlnk = pN+1;
225   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
226 
227   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
228   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
229   nnz           = adi[am] + aoi[am];
230   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);
231   current_space = free_space;
232 
233   /* determine symbolic info for each row of C: */
234   for (i=0;i<pN;i++) {
235     ptnzi  = pti[i+1] - pti[i];
236     nprow_loc = 0; nprow_oth = 0;
237     /* i-th row of symbolic P_loc^T*A_loc: */
238     for (j=0;j<ptnzi;j++) {
239       arow = *ptJ++;
240       /* diagonal portion of A */
241       anzj = adi[arow+1] - adi[arow];
242       ajj  = adj + adi[arow];
243       for (k=0;k<anzj;k++) {
244         col = ajj[k]+prstart;
245         if (!ptadenserow_loc[col]) {
246           ptadenserow_loc[col]    = -1;
247           ptasparserow_loc[nprow_loc++] = col;
248         }
249       }
250       /* off-diagonal portion of A */
251       anzj = aoi[arow+1] - aoi[arow];
252       ajj  = aoj + aoi[arow];
253       for (k=0;k<anzj;k++) {
254         col = a->garray[ajj[k]];  /* global col */
255         if (col < cstart){
256           col = ajj[k];
257         } else if (col >= cend){
258           col = ajj[k] + pm;
259         } else {
260           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
261         }
262         if (!ptadenserow_oth[col]) {
263           ptadenserow_oth[col]    = -1;
264           ptasparserow_oth[nprow_oth++] = col;
265         }
266       }
267     }
268 
269     /* using symbolic info of local PtA, determine symbolic info for row of C: */
270     cnzi   = 0;
271     /* rows of P_loc */
272     ptaj = ptasparserow_loc;
273     for (j=0; j<nprow_loc; j++) {
274       prow = *ptaj++;
275       prow -= prstart; /* rm */
276       pnzj = pi_loc[prow+1] - pi_loc[prow];
277       pjj  = pj_loc + pi_loc[prow];
278       /* add non-zero cols of P into the sorted linked list lnk */
279       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
280       cnzi += nlnk;
281     }
282     /* rows of P_oth */
283     ptaj = ptasparserow_oth;
284     for (j=0; j<nprow_oth; j++) {
285       prow = *ptaj++;
286       if (prow >= prend) prow -= pm; /* rm */
287       pnzj = pi_oth[prow+1] - pi_oth[prow];
288       pjj  = pj_oth + pi_oth[prow];
289       /* add non-zero cols of P into the sorted linked list lnk */
290       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
291       cnzi += nlnk;
292     }
293 
294     /* If free space is not available, make more free space */
295     /* Double the amount of total space in the list */
296     if (current_space->local_remaining<cnzi) {
297       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
298     }
299 
300     /* Copy data into free space, and zero out denserows */
301     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
302     current_space->array           += cnzi;
303     current_space->local_used      += cnzi;
304     current_space->local_remaining -= cnzi;
305 
306     for (j=0;j<nprow_loc; j++) {
307       ptadenserow_loc[ptasparserow_loc[j]] = 0;
308     }
309     for (j=0;j<nprow_oth; j++) {
310       ptadenserow_oth[ptasparserow_oth[j]] = 0;
311     }
312 
313     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
314     /*        For now, we will recompute what is needed. */
315     ci[i+1] = ci[i] + cnzi;
316   }
317   /* Clean up. */
318   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
319 
320   /* nnz is now stored in ci[ptm], column indices are in the list of free space */
321   /* Allocate space for cj, initialize cj, and */
322   /* destroy list of free space and other temporary array(s) */
323   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
324   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
325   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
326   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
327 
328   /* Allocate space for ca */
329   /*
330   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
331   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
332   */
333 
334   /* add C_seq into mpi C              */
335   /*-----------------------------------*/
336   free_space=PETSC_NULL; current_space=PETSC_NULL;
337 
338   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
339   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
340 
341   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
342 
343 
344   /* determine row ownership */
345   /*---------------------------------------------------------*/
346   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
347   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
348   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
349   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
350 
351   /* determine the number of messages to send, their lengths */
352   /*---------------------------------------------------------*/
353   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
354   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
355   len_s  = merge->len_s;
356   len = 0;  /* length of buf_si[] */
357   merge->nsend = 0;
358   for (proc=0; proc<size; proc++){
359     len_si[proc] = 0;
360     if (proc == rank){
361       len_s[proc] = 0;
362     } else {
363       len_si[proc] = owners[proc+1] - owners[proc] + 1;
364       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of rows to be sent to [proc] */
365     }
366     if (len_s[proc]) {
367       merge->nsend++;
368       nrows = 0;
369       for (i=owners[proc]; i<owners[proc+1]; i++){
370         if (ci[i+1] > ci[i]) nrows++;
371       }
372       len_si[proc] = 2*(nrows+1);
373       len += len_si[proc];
374     }
375   }
376 
377   /* determine the number and length of messages to receive for ij-structure */
378   /*-------------------------------------------------------------------------*/
379   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
380   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
381 
382   /* post the Irecv of j-structure */
383   /*-------------------------------*/
384   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
385   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
386 
387   /* post the Isend of j-structure */
388   /*--------------------------------*/
389   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
390   sj_waits = si_waits + merge->nsend;
391 
392   for (proc=0, k=0; proc<size; proc++){
393     if (!len_s[proc]) continue;
394     i = owners[proc];
395     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
396     k++;
397   }
398 
399   /* receives and sends of j-structure are complete */
400   /*------------------------------------------------*/
401   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
402   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
403   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
404 
405   /* send and recv i-structure */
406   /*---------------------------*/
407   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
408   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
409 
410   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
411   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
412   for (proc=0,k=0; proc<size; proc++){
413     if (!len_s[proc]) continue;
414     /* form outgoing message for i-structure:
415          buf_si[0]:                 nrows to be sent
416                [1:nrows]:           row index (global)
417                [nrows+1:2*nrows+1]: i-structure index
418     */
419     /*-------------------------------------------*/
420     nrows = len_si[proc]/2 - 1;
421     buf_si_i    = buf_si + nrows+1;
422     buf_si[0]   = nrows;
423     buf_si_i[0] = 0;
424     nrows = 0;
425     for (i=owners[proc]; i<owners[proc+1]; i++){
426       anzi = ci[i+1] - ci[i];
427       if (anzi) {
428         buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */
429         buf_si[nrows+1] = i-owners[proc]; /* local row index */
430         nrows++;
431       }
432     }
433     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
434     k++;
435     buf_si += len_si[proc];
436   }
437 
438   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
439   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
440 
441   ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
442   for (i=0; i<merge->nrecv; i++){
443     ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
444   }
445 
446   ierr = PetscFree(len_si);CHKERRQ(ierr);
447   ierr = PetscFree(len_ri);CHKERRQ(ierr);
448   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
449   ierr = PetscFree(si_waits);CHKERRQ(ierr);
450   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
451   ierr = PetscFree(buf_s);CHKERRQ(ierr);
452   ierr = PetscFree(status);CHKERRQ(ierr);
453 
454   /* compute a local seq matrix in each processor */
455   /*----------------------------------------------*/
456   /* allocate bi array and free space for accumulating nonzero column info */
457   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
458   bi[0] = 0;
459 
460   /* create and initialize a linked list */
461   nlnk = pN+1;
462   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
463 
464   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
465   len = 0;
466   len  = ci[owners[rank+1]] - ci[owners[rank]];
467   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
468   current_space = free_space;
469 
470   /* determine symbolic info for each local row */
471   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
472   nextrow = buf_ri_k + merge->nrecv;
473   nextci  = nextrow + merge->nrecv;
474   for (k=0; k<merge->nrecv; k++){
475     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
476     nrows = *buf_ri_k[k];
477     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
478     nextci[k]   = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure  */
479   }
480 
481   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
482   len = 0;
483   for (i=0; i<pn; i++) {
484     bnzi   = 0;
485     /* add local non-zero cols of this proc's C_seq into lnk */
486     arow   = owners[rank] + i;
487     anzi   = ci[arow+1] - ci[arow];
488     cji    = cj + ci[arow];
489     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
490     bnzi += nlnk;
491     /* add received col data into lnk */
492     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
493       if (i == *nextrow[k]) { /* i-th row */
494         anzi = *(nextci[k]+1) - *nextci[k];
495         cji  = buf_rj[k] + *nextci[k];
496         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
497         bnzi += nlnk;
498         nextrow[k]++; nextci[k]++;
499       }
500     }
501     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
502 
503     /* if free space is not available, make more free space */
504     if (current_space->local_remaining<bnzi) {
505       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
506       nspacedouble++;
507     }
508     /* copy data into free space, then initialize lnk */
509     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
510     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
511 
512     current_space->array           += bnzi;
513     current_space->local_used      += bnzi;
514     current_space->local_remaining -= bnzi;
515 
516     bi[i+1] = bi[i] + bnzi;
517   }
518 
519   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
520 
521   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
522   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
523   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
524 
525   /* create symbolic parallel matrix B_mpi */
526   /*---------------------------------------*/
527   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
528   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
529   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
530   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
531   /* ierr = MatSetOption(B_mpi,MAT_COLUMNS_SORTED);CHKERRQ(ierr); -cause delay? */
532 
533   /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
534   B_mpi->assembled     = PETSC_FALSE;
535   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
536   merge->bi            = bi;
537   merge->bj            = bj;
538   merge->ci            = ci;
539   merge->cj            = cj;
540   merge->buf_ri        = buf_ri;
541   merge->buf_rj        = buf_rj;
542   merge->C_seq         = PETSC_NULL;
543 
544   /* attach the supporting struct to B_mpi for reuse */
545   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
546   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
547   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
548   *C = B_mpi;
549 
550   PetscFunctionReturn(0);
551 }
552 
553 #undef __FUNCT__
554 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
555 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
556 {
557   PetscErrorCode       ierr;
558   Mat_Merge_SeqsToMPI  *merge;
559   Mat_MatMatMultMPI    *ptap;
560   PetscObjectContainer cont_merge,cont_ptap;
561 
562   PetscInt       flops=0;
563   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
564   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
565   Mat_SeqAIJ     *p_loc,*p_oth;
566   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
567   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
568   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
569   PetscInt       *cjj;
570   MatScalar      *ada=ad->a,*aoa=ao->a,*ada_tmp=ad->a,*aoa_tmp=ao->a,*apa,*paj,*cseqa,*caj;
571   MatScalar      *pa_loc,*pA,*pa_oth;
572   PetscInt       am=A->m,cN=C->N;
573   PetscInt       nextp,*adj_tmp=ad->j,*aoj_tmp=ao->j;
574 
575   MPI_Comm             comm=C->comm;
576   PetscMPIInt          size,rank,taga,*len_s;
577   PetscInt             *owners;
578   PetscInt             proc;
579   PetscInt             **buf_ri,**buf_rj;
580   PetscInt             cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
581   PetscInt             nrows,**buf_ri_k,**nextrow,**nextcseqi;
582   MPI_Request          *s_waits,*r_waits;
583   MPI_Status           *status;
584   MatScalar            **abuf_r,*ba_i;
585   PetscInt             *cseqi,*cseqj;
586   PetscInt             *cseqj_tmp;
587   MatScalar            *cseqa_tmp;
588   PetscInt             stages[2];
589   PetscInt             *apsymi,*apsymj;
590   PetscTruth           HasSymAP;
591   FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
592 
593   PetscFunctionBegin;
594   ierr = PetscLogStageRegister(&stages[0],"NumPtAP_local");CHKERRQ(ierr);
595   ierr = PetscLogStageRegister(&stages[1],"NumPtAP_Comm");CHKERRQ(ierr);
596 
597   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
598   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
599 
600   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
601   if (cont_merge) {
602     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
603   } else {
604     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not posses an object container");
605   }
606   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
607   if (cont_ptap) {
608     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
609   } else {
610     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not posses an object container");
611   }
612 
613   /* set or get data from symbolic A*P */
614   if (!ptap->abnz_max) {
615     HasSymAP = PETSC_FALSE;
616     ierr  = PetscMalloc((am+2)*sizeof(PetscInt),&apsymi);CHKERRQ(ierr);
617     ptap->abi = apsymi;
618     apsymi[0] = 0;
619     ierr = PetscMalloc(cN*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
620     ierr = PetscMemzero(apa,cN*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
621     apj  = (PetscInt *)(apa + cN);
622     apjdense = apj + cN;
623     /* Initial FreeSpace size is 2*nnz(A) */
624     ierr = GetMoreSpace((PetscInt)(2*(adi[am]+aoi[am])),&free_space);CHKERRQ(ierr);
625     current_space = free_space;
626   } else {
627     HasSymAP = PETSC_TRUE;
628     ierr = PetscMalloc((ptap->abnz_max+1)*sizeof(MatScalar),&apa);CHKERRQ(ierr);
629     ierr = PetscMemzero(apa,ptap->abnz_max*sizeof(MatScalar));CHKERRQ(ierr);
630     apsymi = ptap->abi; apsymj = ptap->abj;
631   }
632 
633   /* get data from symbolic products */
634   p_loc=(Mat_SeqAIJ*)(ptap->B_loc)->data;
635   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
636   pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
637   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
638 
639   cseqi = merge->ci; cseqj=merge->cj;
640   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
641   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
642 
643   ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
644   for (i=0;i<am;i++) {
645     /* Form i-th sparse row of A*P */
646     if (!HasSymAP){ /* Compute symbolic i-th sparse row of A*P */
647       apnzj = 0;
648       /* diagonal portion of A */
649       anzi  = adi[i+1] - adi[i];
650       for (j=0;j<anzi;j++) {
651         prow = *adj;
652         adj++;
653         pnzj = pi_loc[prow+1] - pi_loc[prow];
654         pjj  = pj_loc + pi_loc[prow];
655         paj  = pa_loc + pi_loc[prow];
656         for (k=0;k<pnzj;k++) {
657           if (!apjdense[pjj[k]]) {
658             apjdense[pjj[k]] = -1;
659             apj[apnzj++]     = pjj[k];
660           }
661         }
662         flops += 2*pnzj;
663         ada++;
664       }
665       /* off-diagonal portion of A */
666       anzi  = aoi[i+1] - aoi[i];
667       for (j=0;j<anzi;j++) {
668         col = a->garray[*aoj];
669         if (col < cstart){
670           prow = *aoj;
671         } else if (col >= cend){
672           prow = *aoj;
673         } else {
674           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
675         }
676         aoj++;
677         pnzj = pi_oth[prow+1] - pi_oth[prow];
678         pjj  = pj_oth + pi_oth[prow];
679         paj  = pa_oth + pi_oth[prow];
680         for (k=0;k<pnzj;k++) {
681           if (!apjdense[pjj[k]]) {
682             apjdense[pjj[k]] = -1;
683             apj[apnzj++]     = pjj[k];
684           }
685         }
686         flops += 2*pnzj;
687         aoa++;
688       }
689       /* Sort the j index array for quick sparse axpy. */
690       ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
691 
692       apsymi[i+1] = apsymi[i] + apnzj;
693       if (ptap->abnz_max < apnzj) ptap->abnz_max = apnzj;
694 
695       /* If free space is not available, make more free space */
696       /* Double the amount of total space in the list */
697       if (current_space->local_remaining<apnzj) {
698         ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
699       }
700 
701       /* Copy data into free space, then initialize lnk */
702       /* ierr = PetscLLClean(bn,bn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr); */
703       for (j=0; j<apnzj; j++) current_space->array[j] = apj[j];
704       current_space->array           += apnzj;
705       current_space->local_used      += apnzj;
706       current_space->local_remaining -= apnzj;
707 
708     } else { /* get symbolic AP[i,:] */
709       apnzj = apsymi[i+1] - apsymi[i];
710       apj   = apsymj + apsymi[i];
711     }
712 
713     /* use symbolic AP[i,:] to form a condensed array apa */
714     /* diagonal portion of A */
715     anzi  = adi[i+1] - adi[i];
716     for (j=0;j<anzi;j++) {
717       prow = *adj_tmp;
718       adj_tmp++;
719       pnzj = pi_loc[prow+1] - pi_loc[prow];
720       pjj  = pj_loc + pi_loc[prow];
721       paj  = pa_loc + pi_loc[prow];
722       nextp = 0;
723       for (k=0; nextp<pnzj; k++) {
724         if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
725           apa[k] += (*ada_tmp)*paj[nextp++];
726         }
727       }
728       ada_tmp++;
729     }
730     /* off-diagonal portion of A */
731     anzi  = aoi[i+1] - aoi[i];
732     for (j=0;j<anzi;j++) {
733       col = a->garray[*aoj_tmp];
734       if (col < cstart){
735         prow = *aoj_tmp;
736       } else if (col >= cend){
737         prow = *aoj_tmp;
738       } else {
739         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
740       }
741       aoj_tmp++;
742       pnzj = pi_oth[prow+1] - pi_oth[prow];
743       pjj  = pj_oth + pi_oth[prow];
744       paj  = pa_oth + pi_oth[prow];
745       nextp = 0;
746       for (k=0; nextp<pnzj; k++) {
747         if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
748           apa[k] += (*aoa_tmp)*paj[nextp++];
749         }
750       }
751       flops += 2*pnzj;
752       aoa_tmp++;
753     }
754 
755     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
756     pnzi = pi_loc[i+1] - pi_loc[i];
757     for (j=0;j<pnzi;j++) {
758       nextap = 0;
759       crow   = *pJ++;
760       cjj    = cseqj + cseqi[crow];
761       caj    = cseqa + cseqi[crow];
762       /* Perform sparse axpy operation.  Note cjj includes apj. */
763       for (k=0;nextap<apnzj;k++) {
764         if (cjj[k]==apj[nextap]) caj[k] += (*pA)*apa[nextap++];
765       }
766 
767       flops += 2*apnzj;
768       pA++;
769     }
770 
771     /* Zero the current row info for A*P */
772     for (j=0;j<apnzj;j++) {
773       if (!HasSymAP){
774         apjdense[apj[j]] = 0;
775       }
776       apa[j] = 0;
777     }
778   }
779   if (!HasSymAP){
780   /* Column indices of AP are in the list of free space */
781   /* Allocate space for apsymj, initialize apsymj, and */
782   /* destroy list of free space and other temporary array(s) */
783     ierr = PetscMalloc((apsymi[am]+1)*sizeof(PetscInt),&ptap->abj);CHKERRQ(ierr);
784     ierr = MakeSpaceContiguous(&free_space,ptap->abj);CHKERRQ(ierr);
785   }
786   ierr = PetscFree(apa);CHKERRQ(ierr);
787   ierr = PetscLogStagePop();
788 
789   bi     = merge->bi;
790   bj     = merge->bj;
791   buf_ri = merge->buf_ri;
792   buf_rj = merge->buf_rj;
793 
794   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
795   ierr   = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
796 
797   /* send and recv matrix values */
798   /*-----------------------------*/
799   ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
800   len_s  = merge->len_s;
801   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
802   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
803 
804   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
805   for (proc=0,k=0; proc<size; proc++){
806     if (!len_s[proc]) continue;
807     i = owners[proc];
808     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
809     k++;
810   }
811 
812   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
813   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
814   ierr = PetscFree(status);CHKERRQ(ierr);
815 
816   ierr = PetscFree(s_waits);CHKERRQ(ierr);
817   ierr = PetscFree(r_waits);CHKERRQ(ierr);
818 
819   /* insert mat values of mpimat */
820   /*----------------------------*/
821   ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr);
822   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
823   nextrow = buf_ri_k + merge->nrecv;
824   nextcseqi  = nextrow + merge->nrecv;
825 
826   for (k=0; k<merge->nrecv; k++){
827     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
828     nrows = *(buf_ri_k[k]);
829     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th recved i-structure */
830     nextcseqi[k]   = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure  */
831   }
832 
833   /* set values of ba */
834   for (i=0; i<C->m; i++) {
835     cseqrow = owners[rank] + i; /* global row index of C_seq */
836     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
837     bnzi = bi[i+1] - bi[i];
838     ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
839 
840     /* add local non-zero vals of this proc's C_seq into ba */
841     cseqnzi = cseqi[cseqrow+1] - cseqi[cseqrow];
842     cseqj_tmp = cseqj + cseqi[cseqrow];
843     cseqa_tmp = cseqa + cseqi[cseqrow];
844     nextcseqj = 0;
845     for (j=0; nextcseqj<cseqnzi; j++){
846       if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
847         ba_i[j] += cseqa_tmp[nextcseqj++];
848       }
849     }
850 
851     /* add received vals into ba */
852     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
853       /* i-th row */
854       if (i == *nextrow[k]) {
855         cseqnzi   = *(nextcseqi[k]+1) - *nextcseqi[k];
856         cseqj_tmp = buf_rj[k] + *(nextcseqi[k]);
857         cseqa_tmp = abuf_r[k] + *(nextcseqi[k]);
858         nextcseqj = 0;
859         for (j=0; nextcseqj<cseqnzi; j++){
860           if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
861             ba_i[j] += cseqa_tmp[nextcseqj++];
862           }
863         }
864         nextrow[k]++; nextcseqi[k]++;
865       }
866     }
867     ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
868     flops += 2*bnzi;
869   }
870   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
871   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
872   ierr = PetscLogStagePop();CHKERRQ(ierr);
873 
874   ierr = PetscFree(cseqa);CHKERRQ(ierr);
875   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
876   ierr = PetscFree(ba_i);CHKERRQ(ierr);
877   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
878   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
879 
880   PetscFunctionReturn(0);
881 }
882 
883 #undef __FUNCT__
884 #define __FUNCT__ "MatPtAP_SeqAIJ_SeqAIJ"
885 PetscErrorCode MatPtAP_SeqAIJ_SeqAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
886 {
887   PetscErrorCode ierr;
888 
889   PetscFunctionBegin;
890   if (scall == MAT_INITIAL_MATRIX){
891     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
892     ierr = MatPtAPSymbolic_SeqAIJ_SeqAIJ(A,P,fill,C);CHKERRQ(ierr);
893     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
894   }
895   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
896   ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,*C);CHKERRQ(ierr);
897   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
898   PetscFunctionReturn(0);
899 }
900 
901 /*
902    MatPtAPSymbolic - Creates the (i,j) structure of the matrix projection C = P^T * A * P
903 
904    Collective on Mat
905 
906    Input Parameters:
907 +  A - the matrix
908 -  P - the projection matrix
909 
910    Output Parameters:
911 .  C - the (i,j) structure of the product matrix
912 
913    Notes:
914    C will be created and must be destroyed by the user with MatDestroy().
915 
916    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
917    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.  The product is computed using
918    this (i,j) structure by calling MatPtAPNumeric().
919 
920    Level: intermediate
921 
922 .seealso: MatPtAP(),MatPtAPNumeric(),MatMatMultSymbolic()
923 */
924 #undef __FUNCT__
925 #define __FUNCT__ "MatPtAPSymbolic"
926 PetscErrorCode MatPtAPSymbolic(Mat A,Mat P,PetscReal fill,Mat *C)
927 {
928   PetscErrorCode ierr;
929   PetscErrorCode (*fA)(Mat,Mat,PetscReal,Mat*);
930   PetscErrorCode (*fP)(Mat,Mat,PetscReal,Mat*);
931 
932   PetscFunctionBegin;
933 
934   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
935   PetscValidType(A,1);
936   MatPreallocated(A);
937   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
938   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
939 
940   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
941   PetscValidType(P,2);
942   MatPreallocated(P);
943   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
944   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
945 
946   PetscValidPointer(C,3);
947 
948   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
949   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
950 
951   /* For now, we do not dispatch based on the type of A and P */
952   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
953   fA = A->ops->ptapsymbolic;
954   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for A of type %s",A->type_name);
955   fP = P->ops->ptapsymbolic;
956   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for P of type %s",P->type_name);
957   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPSymbolic requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
958 
959   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
960   ierr = (*fA)(A,P,fill,C);CHKERRQ(ierr);
961   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
962 
963   PetscFunctionReturn(0);
964 }
965 #ifdef TOBEREMOVE
966 typedef struct {
967   Mat    symAP;
968 } Mat_PtAPstruct;
969 
970 EXTERN PetscErrorCode MatDestroy_SeqAIJ(Mat);
971 
972 #undef __FUNCT__
973 #define __FUNCT__ "MatDestroy_SeqAIJ_PtAP"
974 PetscErrorCode MatDestroy_SeqAIJ_PtAP(Mat A)
975 {
976   PetscErrorCode    ierr;
977   Mat_PtAPstruct    *ptap=(Mat_PtAPstruct*)A->spptr;
978 
979   PetscFunctionBegin;
980   ierr = MatDestroy(ptap->symAP);CHKERRQ(ierr);
981   ierr = PetscFree(ptap);CHKERRQ(ierr);
982   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
983   PetscFunctionReturn(0);
984 }
985 #endif /* TOBEREMOVE */
986 
987 #undef __FUNCT__
988 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqAIJ"
989 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
990 {
991   PetscErrorCode ierr;
992   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
993   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*p = (Mat_SeqAIJ*)P->data,*c;
994   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
995   PetscInt       *ci,*cj,*ptadenserow,*ptasparserow,*ptaj;
996   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M;
997   PetscInt       i,j,k,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,nlnk,*lnk;
998   MatScalar      *ca;
999   PetscBT        lnkbt;
1000 
1001   PetscFunctionBegin;
1002   /* Get ij structure of P^T */
1003   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1004   ptJ=ptj;
1005 
1006   /* Allocate ci array, arrays for fill computation and */
1007   /* free space for accumulating nonzero column info */
1008   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1009   ci[0] = 0;
1010 
1011   ierr = PetscMalloc((2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1012   ierr = PetscMemzero(ptadenserow,(2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1013   ptasparserow = ptadenserow  + an;
1014 
1015   /* create and initialize a linked list */
1016   nlnk = pn+1;
1017   ierr = PetscLLCreate(pn,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1018 
1019   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1020   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1021   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);
1022   current_space = free_space;
1023 
1024   /* Determine symbolic info for each row of C: */
1025   for (i=0;i<pn;i++) {
1026     ptnzi  = pti[i+1] - pti[i];
1027     ptanzi = 0;
1028     /* Determine symbolic row of PtA: */
1029     for (j=0;j<ptnzi;j++) {
1030       arow = *ptJ++;
1031       anzj = ai[arow+1] - ai[arow];
1032       ajj  = aj + ai[arow];
1033       for (k=0;k<anzj;k++) {
1034         if (!ptadenserow[ajj[k]]) {
1035           ptadenserow[ajj[k]]    = -1;
1036           ptasparserow[ptanzi++] = ajj[k];
1037         }
1038       }
1039     }
1040       /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1041     ptaj = ptasparserow;
1042     cnzi   = 0;
1043     for (j=0;j<ptanzi;j++) {
1044       prow = *ptaj++;
1045       pnzj = pi[prow+1] - pi[prow];
1046       pjj  = pj + pi[prow];
1047       /* add non-zero cols of P into the sorted linked list lnk */
1048       ierr = PetscLLAdd(pnzj,pjj,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1049       cnzi += nlnk;
1050     }
1051 
1052     /* If free space is not available, make more free space */
1053     /* Double the amount of total space in the list */
1054     if (current_space->local_remaining<cnzi) {
1055       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1056     }
1057 
1058     /* Copy data into free space, and zero out denserows */
1059     ierr = PetscLLClean(pn,pn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1060     current_space->array           += cnzi;
1061     current_space->local_used      += cnzi;
1062     current_space->local_remaining -= cnzi;
1063 
1064     for (j=0;j<ptanzi;j++) {
1065       ptadenserow[ptasparserow[j]] = 0;
1066     }
1067     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1068     /*        For now, we will recompute what is needed. */
1069     ci[i+1] = ci[i] + cnzi;
1070   }
1071   /* nnz is now stored in ci[ptm], column indices are in the list of free space */
1072   /* Allocate space for cj, initialize cj, and */
1073   /* destroy list of free space and other temporary array(s) */
1074   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1075   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1076   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1077   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1078 
1079   /* Allocate space for ca */
1080   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1081   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1082 
1083   /* put together the new matrix */
1084   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1085 
1086   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1087   /* Since these are PETSc arrays, change flags to free them as necessary. */
1088   c = (Mat_SeqAIJ *)((*C)->data);
1089   c->freedata = PETSC_TRUE;
1090   c->nonew    = 0;
1091 
1092   /* Clean up. */
1093   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1094 
1095   PetscFunctionReturn(0);
1096 }
1097 
1098 #include "src/mat/impls/maij/maij.h"
1099 EXTERN_C_BEGIN
1100 #undef __FUNCT__
1101 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqMAIJ"
1102 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqMAIJ(Mat A,Mat PP,Mat *C)
1103 {
1104   /* This routine requires testing -- I don't think it works. */
1105   PetscErrorCode ierr;
1106   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1107   Mat_SeqMAIJ    *pp=(Mat_SeqMAIJ*)PP->data;
1108   Mat            P=pp->AIJ;
1109   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*p=(Mat_SeqAIJ*)P->data,*c;
1110   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
1111   PetscInt       *ci,*cj,*denserow,*sparserow,*ptadenserow,*ptasparserow,*ptaj;
1112   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M,ppdof=pp->dof;
1113   PetscInt       i,j,k,dof,pdof,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi;
1114   MatScalar      *ca;
1115 
1116   PetscFunctionBegin;
1117   /* Start timer */
1118   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1119 
1120   /* Get ij structure of P^T */
1121   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1122 
1123   /* Allocate ci array, arrays for fill computation and */
1124   /* free space for accumulating nonzero column info */
1125   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1126   ci[0] = 0;
1127 
1128   ierr = PetscMalloc((2*pn+2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1129   ierr = PetscMemzero(ptadenserow,(2*pn+2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1130   ptasparserow = ptadenserow  + an;
1131   denserow     = ptasparserow + an;
1132   sparserow    = denserow     + pn;
1133 
1134   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1135   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1136   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);
1137   current_space = free_space;
1138 
1139   /* Determine symbolic info for each row of C: */
1140   for (i=0;i<pn/ppdof;i++) {
1141     ptnzi  = pti[i+1] - pti[i];
1142     ptanzi = 0;
1143     ptJ    = ptj + pti[i];
1144     for (dof=0;dof<ppdof;dof++) {
1145     /* Determine symbolic row of PtA: */
1146       for (j=0;j<ptnzi;j++) {
1147         arow = ptJ[j] + dof;
1148         anzj = ai[arow+1] - ai[arow];
1149         ajj  = aj + ai[arow];
1150         for (k=0;k<anzj;k++) {
1151           if (!ptadenserow[ajj[k]]) {
1152             ptadenserow[ajj[k]]    = -1;
1153             ptasparserow[ptanzi++] = ajj[k];
1154           }
1155         }
1156       }
1157       /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1158       ptaj = ptasparserow;
1159       cnzi   = 0;
1160       for (j=0;j<ptanzi;j++) {
1161         pdof = *ptaj%dof;
1162         prow = (*ptaj++)/dof;
1163         pnzj = pi[prow+1] - pi[prow];
1164         pjj  = pj + pi[prow];
1165         for (k=0;k<pnzj;k++) {
1166           if (!denserow[pjj[k]+pdof]) {
1167             denserow[pjj[k]+pdof] = -1;
1168             sparserow[cnzi++]     = pjj[k]+pdof;
1169           }
1170         }
1171       }
1172 
1173       /* sort sparserow */
1174       ierr = PetscSortInt(cnzi,sparserow);CHKERRQ(ierr);
1175 
1176       /* If free space is not available, make more free space */
1177       /* Double the amount of total space in the list */
1178       if (current_space->local_remaining<cnzi) {
1179         ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1180       }
1181 
1182       /* Copy data into free space, and zero out denserows */
1183       ierr = PetscMemcpy(current_space->array,sparserow,cnzi*sizeof(PetscInt));CHKERRQ(ierr);
1184       current_space->array           += cnzi;
1185       current_space->local_used      += cnzi;
1186       current_space->local_remaining -= cnzi;
1187 
1188       for (j=0;j<ptanzi;j++) {
1189         ptadenserow[ptasparserow[j]] = 0;
1190       }
1191       for (j=0;j<cnzi;j++) {
1192         denserow[sparserow[j]] = 0;
1193       }
1194       /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1195       /*        For now, we will recompute what is needed. */
1196       ci[i+1+dof] = ci[i+dof] + cnzi;
1197     }
1198   }
1199   /* nnz is now stored in ci[ptm], column indices are in the list of free space */
1200   /* Allocate space for cj, initialize cj, and */
1201   /* destroy list of free space and other temporary array(s) */
1202   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1203   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1204   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1205 
1206   /* Allocate space for ca */
1207   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1208   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1209 
1210   /* put together the new matrix */
1211   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1212 
1213   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1214   /* Since these are PETSc arrays, change flags to free them as necessary. */
1215   c = (Mat_SeqAIJ *)((*C)->data);
1216   c->freedata = PETSC_TRUE;
1217   c->nonew    = 0;
1218 
1219   /* Clean up. */
1220   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1221 
1222   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1223   PetscFunctionReturn(0);
1224 }
1225 EXTERN_C_END
1226 
1227 /*
1228    MatPtAPNumeric - Computes the matrix projection C = P^T * A * P
1229 
1230    Collective on Mat
1231 
1232    Input Parameters:
1233 +  A - the matrix
1234 -  P - the projection matrix
1235 
1236    Output Parameters:
1237 .  C - the product matrix
1238 
1239    Notes:
1240    C must have been created by calling MatPtAPSymbolic and must be destroyed by
1241    the user using MatDeatroy().
1242 
1243    This routine is currently only implemented for pairs of AIJ matrices and classes
1244    which inherit from AIJ.  C will be of type MATAIJ.
1245 
1246    Level: intermediate
1247 
1248 .seealso: MatPtAP(),MatPtAPSymbolic(),MatMatMultNumeric()
1249 */
1250 #undef __FUNCT__
1251 #define __FUNCT__ "MatPtAPNumeric"
1252 PetscErrorCode MatPtAPNumeric(Mat A,Mat P,Mat C)
1253 {
1254   PetscErrorCode ierr;
1255   PetscErrorCode (*fA)(Mat,Mat,Mat);
1256   PetscErrorCode (*fP)(Mat,Mat,Mat);
1257 
1258   PetscFunctionBegin;
1259 
1260   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
1261   PetscValidType(A,1);
1262   MatPreallocated(A);
1263   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1264   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1265 
1266   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
1267   PetscValidType(P,2);
1268   MatPreallocated(P);
1269   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1270   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1271 
1272   PetscValidHeaderSpecific(C,MAT_COOKIE,3);
1273   PetscValidType(C,3);
1274   MatPreallocated(C);
1275   if (!C->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1276   if (C->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1277 
1278   if (P->N!=C->M) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->M);
1279   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
1280   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
1281   if (P->N!=C->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->N);
1282 
1283   /* For now, we do not dispatch based on the type of A and P */
1284   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
1285   fA = A->ops->ptapnumeric;
1286   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for A of type %s",A->type_name);
1287   fP = P->ops->ptapnumeric;
1288   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for P of type %s",P->type_name);
1289   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPNumeric requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
1290 
1291   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1292   ierr = (*fA)(A,P,C);CHKERRQ(ierr);
1293   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1294 
1295   PetscFunctionReturn(0);
1296 }
1297 
1298 #undef __FUNCT__
1299 #define __FUNCT__ "MatPtAPNumeric_SeqAIJ_SeqAIJ"
1300 PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ(Mat A,Mat P,Mat C)
1301 {
1302   PetscErrorCode ierr;
1303   PetscInt       flops=0;
1304   Mat_SeqAIJ     *a  = (Mat_SeqAIJ *) A->data;
1305   Mat_SeqAIJ     *p  = (Mat_SeqAIJ *) P->data;
1306   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
1307   PetscInt       *ai=a->i,*aj=a->j,*apj,*apjdense,*pi=p->i,*pj=p->j,*pJ=p->j,*pjj;
1308   PetscInt       *ci=c->i,*cj=c->j,*cjj;
1309   PetscInt       am=A->M,cn=C->N,cm=C->M;
1310   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1311   MatScalar      *aa=a->a,*apa,*pa=p->a,*pA=p->a,*paj,*ca=c->a,*caj;
1312 
1313   PetscFunctionBegin;
1314   /* Allocate temporary array for storage of one row of A*P */
1315   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1316   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1317 
1318   apj      = (PetscInt *)(apa + cn);
1319   apjdense = apj + cn;
1320 
1321   /* Clear old values in C */
1322   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1323 
1324   for (i=0;i<am;i++) {
1325     /* Form sparse row of A*P */
1326     anzi  = ai[i+1] - ai[i];
1327     apnzj = 0;
1328     for (j=0;j<anzi;j++) {
1329       prow = *aj++;
1330       pnzj = pi[prow+1] - pi[prow];
1331       pjj  = pj + pi[prow];
1332       paj  = pa + pi[prow];
1333       for (k=0;k<pnzj;k++) {
1334         if (!apjdense[pjj[k]]) {
1335           apjdense[pjj[k]] = -1;
1336           apj[apnzj++]     = pjj[k];
1337         }
1338         apa[pjj[k]] += (*aa)*paj[k];
1339       }
1340       flops += 2*pnzj;
1341       aa++;
1342     }
1343 
1344     /* Sort the j index array for quick sparse axpy. */
1345     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1346 
1347     /* Compute P^T*A*P using outer product (P^T)[:,j]*(A*P)[j,:]. */
1348     pnzi = pi[i+1] - pi[i];
1349     for (j=0;j<pnzi;j++) {
1350       nextap = 0;
1351       crow   = *pJ++;
1352       cjj    = cj + ci[crow];
1353       caj    = ca + ci[crow];
1354       /* Perform sparse axpy operation.  Note cjj includes apj. */
1355       for (k=0;nextap<apnzj;k++) {
1356         if (cjj[k]==apj[nextap]) {
1357           caj[k] += (*pA)*apa[apj[nextap++]];
1358         }
1359       }
1360       flops += 2*apnzj;
1361       pA++;
1362     }
1363 
1364     /* Zero the current row info for A*P */
1365     for (j=0;j<apnzj;j++) {
1366       apa[apj[j]]      = 0.;
1367       apjdense[apj[j]] = 0;
1368     }
1369   }
1370 
1371   /* Assemble the final matrix and clean up */
1372   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1373   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1374   ierr = PetscFree(apa);CHKERRQ(ierr);
1375   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1376 
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 /* Compute local C = P_loc^T * A * P - used by MatPtAP_MPIAIJ_MPIAIJ() */
1381 static PetscEvent logkey_matptapnumeric_local = 0;
1382 #undef __FUNCT__
1383 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_Local"
1384 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,Mat C)
1385 {
1386   PetscErrorCode ierr;
1387   PetscInt       flops=0;
1388   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
1389   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1390   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
1391   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
1392   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
1393   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
1394   PetscInt       *pJ=pj_loc,*pjj;
1395   PetscInt       *ci=c->i,*cj=c->j,*cjj;
1396   PetscInt       am=A->m,cn=C->N,cm=C->M;
1397   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1398   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*ca=c->a,*caj;
1399   MatScalar      *pa_loc=p_loc->a,*pA=pa_loc,*pa_oth=p_oth->a;
1400 
1401   PetscFunctionBegin;
1402   if (!logkey_matptapnumeric_local) {
1403     ierr = PetscLogEventRegister(&logkey_matptapnumeric_local,"MatPtAPNumeric_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
1404   }
1405   ierr = PetscLogEventBegin(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
1406 
1407   /* Allocate temporary array for storage of one row of A*P */
1408   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1409   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1410   apj      = (PetscInt *)(apa + cn);
1411   apjdense = apj + cn;
1412 
1413   /* Clear old values in C */
1414   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1415 
1416   for (i=0;i<am;i++) {
1417     /* Form i-th sparse row of A*P */
1418      apnzj = 0;
1419     /* diagonal portion of A */
1420     anzi  = adi[i+1] - adi[i];
1421     for (j=0;j<anzi;j++) {
1422       prow = *adj;
1423       adj++;
1424       pnzj = pi_loc[prow+1] - pi_loc[prow];
1425       pjj  = pj_loc + pi_loc[prow];
1426       paj  = pa_loc + pi_loc[prow];
1427       for (k=0;k<pnzj;k++) {
1428         if (!apjdense[pjj[k]]) {
1429           apjdense[pjj[k]] = -1;
1430           apj[apnzj++]     = pjj[k];
1431         }
1432         apa[pjj[k]] += (*ada)*paj[k];
1433       }
1434       flops += 2*pnzj;
1435       ada++;
1436     }
1437     /* off-diagonal portion of A */
1438     anzi  = aoi[i+1] - aoi[i];
1439     for (j=0;j<anzi;j++) {
1440       col = a->garray[*aoj];
1441       if (col < cstart){
1442         prow = *aoj;
1443       } else if (col >= cend){
1444         prow = *aoj;
1445       } else {
1446         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
1447       }
1448       aoj++;
1449       pnzj = pi_oth[prow+1] - pi_oth[prow];
1450       pjj  = pj_oth + pi_oth[prow];
1451       paj  = pa_oth + pi_oth[prow];
1452       for (k=0;k<pnzj;k++) {
1453         if (!apjdense[pjj[k]]) {
1454           apjdense[pjj[k]] = -1;
1455           apj[apnzj++]     = pjj[k];
1456         }
1457         apa[pjj[k]] += (*aoa)*paj[k];
1458       }
1459       flops += 2*pnzj;
1460       aoa++;
1461     }
1462     /* Sort the j index array for quick sparse axpy. */
1463     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1464 
1465     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
1466     pnzi = pi_loc[i+1] - pi_loc[i];
1467     for (j=0;j<pnzi;j++) {
1468       nextap = 0;
1469       crow   = *pJ++;
1470       cjj    = cj + ci[crow];
1471       caj    = ca + ci[crow];
1472       /* Perform sparse axpy operation.  Note cjj includes apj. */
1473       for (k=0;nextap<apnzj;k++) {
1474         if (cjj[k]==apj[nextap]) {
1475           caj[k] += (*pA)*apa[apj[nextap++]];
1476         }
1477       }
1478       flops += 2*apnzj;
1479       pA++;
1480     }
1481 
1482     /* Zero the current row info for A*P */
1483     for (j=0;j<apnzj;j++) {
1484       apa[apj[j]]      = 0.;
1485       apjdense[apj[j]] = 0;
1486     }
1487   }
1488 
1489   /* Assemble the final matrix and clean up */
1490   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1491   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1492   ierr = PetscFree(apa);CHKERRQ(ierr);
1493   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1494   ierr = PetscLogEventEnd(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
1495   PetscFunctionReturn(0);
1496 }
1497 static PetscEvent logkey_matptapsymbolic_local = 0;
1498 #undef __FUNCT__
1499 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local"
1500 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,PetscReal fill,Mat *C)
1501 {
1502   PetscErrorCode ierr;
1503   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1504   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
1505   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
1506   PetscInt       *pti,*ptj,*ptJ,*ajj,*pjj;
1507   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
1508   PetscInt       *ci,*cj,*ptaj;
1509   PetscInt       aN=A->N,am=A->m,pN=P_loc->N;
1510   PetscInt       i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
1511   PetscInt       pm=P_loc->m,nlnk,*lnk;
1512   MatScalar      *ca;
1513   PetscBT        lnkbt;
1514   PetscInt       prend,nprow_loc,nprow_oth;
1515   PetscInt       *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
1516   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
1517   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
1518 
1519   PetscFunctionBegin;
1520   if (!logkey_matptapsymbolic_local) {
1521     ierr = PetscLogEventRegister(&logkey_matptapsymbolic_local,"MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);
1522   }
1523   ierr = PetscLogEventBegin(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
1524 
1525   prend = prstart + pm;
1526 
1527   /* get ij structure of P_loc^T */
1528   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1529   ptJ=ptj;
1530 
1531   /* Allocate ci array, arrays for fill computation and */
1532   /* free space for accumulating nonzero column info */
1533   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1534   ci[0] = 0;
1535 
1536   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
1537   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
1538   ptasparserow_loc = ptadenserow_loc + aN;
1539   ptadenserow_oth  = ptasparserow_loc + aN;
1540   ptasparserow_oth = ptadenserow_oth + aN;
1541 
1542   /* create and initialize a linked list */
1543   nlnk = pN+1;
1544   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1545 
1546   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
1547   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1548   nnz           = adi[am] + aoi[am];
1549   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);
1550   current_space = free_space;
1551 
1552   /* determine symbolic info for each row of C: */
1553   for (i=0;i<pN;i++) {
1554     ptnzi  = pti[i+1] - pti[i];
1555     nprow_loc = 0; nprow_oth = 0;
1556     /* i-th row of symbolic P_loc^T*A_loc: */
1557     for (j=0;j<ptnzi;j++) {
1558       arow = *ptJ++;
1559       /* diagonal portion of A */
1560       anzj = adi[arow+1] - adi[arow];
1561       ajj  = adj + adi[arow];
1562       for (k=0;k<anzj;k++) {
1563         col = ajj[k]+prstart;
1564         if (!ptadenserow_loc[col]) {
1565           ptadenserow_loc[col]    = -1;
1566           ptasparserow_loc[nprow_loc++] = col;
1567         }
1568       }
1569       /* off-diagonal portion of A */
1570       anzj = aoi[arow+1] - aoi[arow];
1571       ajj  = aoj + aoi[arow];
1572       for (k=0;k<anzj;k++) {
1573         col = a->garray[ajj[k]];  /* global col */
1574         if (col < cstart){
1575           col = ajj[k];
1576         } else if (col >= cend){
1577           col = ajj[k] + pm;
1578         } else {
1579           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
1580         }
1581         if (!ptadenserow_oth[col]) {
1582           ptadenserow_oth[col]    = -1;
1583           ptasparserow_oth[nprow_oth++] = col;
1584         }
1585       }
1586     }
1587 
1588     /* using symbolic info of local PtA, determine symbolic info for row of C: */
1589     cnzi   = 0;
1590     /* rows of P_loc */
1591     ptaj = ptasparserow_loc;
1592     for (j=0; j<nprow_loc; j++) {
1593       prow = *ptaj++;
1594       prow -= prstart; /* rm */
1595       pnzj = pi_loc[prow+1] - pi_loc[prow];
1596       pjj  = pj_loc + pi_loc[prow];
1597       /* add non-zero cols of P into the sorted linked list lnk */
1598       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1599       cnzi += nlnk;
1600     }
1601     /* rows of P_oth */
1602     ptaj = ptasparserow_oth;
1603     for (j=0; j<nprow_oth; j++) {
1604       prow = *ptaj++;
1605       if (prow >= prend) prow -= pm; /* rm */
1606       pnzj = pi_oth[prow+1] - pi_oth[prow];
1607       pjj  = pj_oth + pi_oth[prow];
1608       /* add non-zero cols of P into the sorted linked list lnk */
1609       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1610       cnzi += nlnk;
1611     }
1612 
1613     /* If free space is not available, make more free space */
1614     /* Double the amount of total space in the list */
1615     if (current_space->local_remaining<cnzi) {
1616       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1617     }
1618 
1619     /* Copy data into free space, and zero out denserows */
1620     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1621     current_space->array           += cnzi;
1622     current_space->local_used      += cnzi;
1623     current_space->local_remaining -= cnzi;
1624 
1625     for (j=0;j<nprow_loc; j++) {
1626       ptadenserow_loc[ptasparserow_loc[j]] = 0;
1627     }
1628     for (j=0;j<nprow_oth; j++) {
1629       ptadenserow_oth[ptasparserow_oth[j]] = 0;
1630     }
1631 
1632     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1633     /*        For now, we will recompute what is needed. */
1634     ci[i+1] = ci[i] + cnzi;
1635   }
1636   /* nnz is now stored in ci[ptm], column indices are in the list of free space */
1637   /* Allocate space for cj, initialize cj, and */
1638   /* destroy list of free space and other temporary array(s) */
1639   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1640   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1641   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
1642   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1643 
1644   /* Allocate space for ca */
1645   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1646   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1647 
1648   /* put together the new matrix */
1649   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,pN,pN,ci,cj,ca,C);CHKERRQ(ierr);
1650 
1651   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1652   /* Since these are PETSc arrays, change flags to free them as necessary. */
1653   c = (Mat_SeqAIJ *)((*C)->data);
1654   c->freedata = PETSC_TRUE;
1655   c->nonew    = 0;
1656 
1657   /* Clean up. */
1658   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1659 
1660   ierr = PetscLogEventEnd(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
1661   PetscFunctionReturn(0);
1662 }
1663