xref: /petsc/src/mat/impls/aij/seq/matptap.c (revision 44b1ac3a359c970a16e21892fa52f90f5f9cf3d3)
1 /*
2   Defines projective product routines where A is an AIJ matrix
3           C = P^T * A * P
4 */
5 
6 #include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
7 #include "src/mat/utils/freespace.h"
8 #include "src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatPtAP"
13 /*@
14    MatPtAP - Creates the matrix projection C = P^T * A * P
15 
16    Collective on Mat
17 
18    Input Parameters:
19 +  A - the matrix
20 .  P - the projection matrix
21 .  scall - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
22 -  fill - expected fill as ratio of nnz(C)/(nnz(A) + nnz(P))
23 
24    Output Parameters:
25 .  C - the product matrix
26 
27    Notes:
28    C will be created and must be destroyed by the user with MatDestroy().
29 
30    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
31    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.
32 
33    Level: intermediate
34 
35 .seealso: MatPtAPSymbolic(),MatPtAPNumeric(),MatMatMult()
36 @*/
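/*
   Minimal usage sketch (illustrative only; A, P and C below are hypothetical, with A and P
   assumed to be assembled AIJ matrices of compatible dimensions):

      Mat C;
      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
      ... use C, e.g. as a coarse-grid operator ...
      ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);    recompute after A's values change
      ierr = MatDestroy(C);CHKERRQ(ierr);
*/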
37 PetscErrorCode MatPtAP(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
38 {
39   PetscErrorCode ierr;
40   PetscErrorCode (*fA)(Mat,Mat,MatReuse,PetscReal,Mat *);
41   PetscErrorCode (*fP)(Mat,Mat,MatReuse,PetscReal,Mat *);
42 
43   PetscFunctionBegin;
44   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
45   PetscValidType(A,1);
46   MatPreallocated(A);
47   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
48   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
49   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
50   PetscValidType(P,2);
51   MatPreallocated(P);
52   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
53   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
54   PetscValidPointer(C,3);
55 
56   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
57 
58   if (fill <=0.0) SETERRQ1(PETSC_ERR_ARG_SIZ,"fill=%g must be > 0.0",fill);
59 
60   /* For now, we do not dispatch based on the type of A and P */
61   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
62   fA = A->ops->ptap;
63   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for A of type %s",A->type_name);
64   fP = P->ops->ptap;
65   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for P of type %s",P->type_name);
66   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAP requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
67 
68   ierr = PetscLogEventBegin(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
69   ierr = (*fA)(A,P,scall,fill,C);CHKERRQ(ierr);
70   ierr = PetscLogEventEnd(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
71   PetscFunctionReturn(0);
72 }
73 
74 EXTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,PetscReal,Mat*);
75 EXTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,Mat C);
76 
77 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
78 #undef __FUNCT__
79 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
80 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
81 {
82   PetscErrorCode       ierr;
83   Mat_MatMatMultMPI    *ptap;
84   PetscObjectContainer container;
85 
86   PetscFunctionBegin;
87   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
88   if (container) {
89     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
90     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
91     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
92     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
93     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
94     ierr = PetscFree(ptap);CHKERRQ(ierr);
95     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
96     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
97   }
98 
99 
100   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
101   PetscFunctionReturn(0);
102 }
103 
104 #undef __FUNCT__
105 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
106 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
107 {
108   PetscErrorCode       ierr;
109   Mat_MatMatMultMPI    *ptap;
110   PetscObjectContainer container;
111 
112   PetscFunctionBegin;
113   if (scall == MAT_INITIAL_MATRIX){
114     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
115     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
116 
117     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
118     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
119 
120     /* get P_loc by taking all local rows of P */
121     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
122 
123     /* attach the supporting struct to P for reuse */
124     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
125     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
126     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
127     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
128 
129     /* now, compute symbolic local P^T*A*P */
130     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
131     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
132   } else if (scall == MAT_REUSE_MATRIX){
133     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
134     if (container) {
135       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
136     } else {
137       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
138     }
139 
140     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
141     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
142 
143     /* get P_loc by taking all local rows of P */
144     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
145 
146   } else {
147     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
148   }
149   /* now, compute numeric local P^T*A*P */
150   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
151   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
152   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
153 
154   PetscFunctionReturn(0);
155 }
156 
157 #undef __FUNCT__
158 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
159 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
160 {
161   PetscErrorCode       ierr;
162   Mat                  P_loc,P_oth;
163   Mat_MatMatMultMPI    *ptap;
164   PetscObjectContainer container;
165   FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
166   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
167   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
168   Mat_SeqAIJ           *p_loc,*p_oth;
169   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj;
170   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
171   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
172   PetscInt             aN=A->N,am=A->m,pN=P->N;
173   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
174   MatScalar            *ca;
175   PetscBT              lnkbt;
176   PetscInt             prstart,prend,nprow_loc,nprow_oth;
177   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
178 
179   MPI_Comm             comm=A->comm;
180   Mat                  B_mpi;
181   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
182   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
183   PetscInt             len,proc,*dnz,*onz,*owners;
184   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
185   PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
186   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
187   MPI_Status           *status;
188   Mat_Merge_SeqsToMPI  *merge;
189 
190   PetscFunctionBegin;
191   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
192   if (container) {
193     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
194   } else {
195     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
196   }
197   /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
198   /*--------------------------------------------------*/
199   /* get data from P_loc and P_oth */
200   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
201   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
202   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
203   prend = prstart + pm;
204 
205   /* get ij structure of P_loc^T */
206   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
207   ptJ=ptj;
208 
209   /* Allocate ci array, arrays for fill computation and */
210   /* free space for accumulating nonzero column info */
211   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
212   ci[0] = 0;
213 
214   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
215   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
216   ptasparserow_loc = ptadenserow_loc + aN;
217   ptadenserow_oth  = ptasparserow_loc + aN;
218   ptasparserow_oth = ptadenserow_oth + aN;
219 
220   /* create and initialize a linked list */
221   nlnk = pN+1;
222   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
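  /* lnk holds a sorted linked list of the column indices accumulated for the current row of C;
     lnkbt is the companion bit table used by PetscLLAdd()/PetscLLClean() to detect duplicates. */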
223 
224   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
225   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
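  /* For example (illustrative numbers only): fill=2, nnz(A)=1000, pN=200, aN=400
     requests an initial free space of 2*1000*200/400+1 = 1001 entries. */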
226   nnz           = adi[am] + aoi[am];
227   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
228   current_space = free_space;
229 
230   /* determine symbolic info for each row of C: */
231   for (i=0;i<pN;i++) {
232     ptnzi  = pti[i+1] - pti[i];
233     nprow_loc = 0; nprow_oth = 0;
234     /* i-th row of symbolic P_loc^T*A_loc: */
235     for (j=0;j<ptnzi;j++) {
236       arow = *ptJ++;
237       /* diagonal portion of A */
238       anzj = adi[arow+1] - adi[arow];
239       ajj  = adj + adi[arow];
240       for (k=0;k<anzj;k++) {
241         col = ajj[k]+prstart;
242         if (!ptadenserow_loc[col]) {
243           ptadenserow_loc[col]    = -1;
244           ptasparserow_loc[nprow_loc++] = col;
245         }
246       }
247       /* off-diagonal portion of A */
248       anzj = aoi[arow+1] - aoi[arow];
249       ajj  = aoj + aoi[arow];
250       for (k=0;k<anzj;k++) {
251         col = a->garray[ajj[k]];  /* global col */
252         if (col < cstart){
253           col = ajj[k];
254         } else if (col >= cend){
255           col = ajj[k] + pm;
256         } else {
257           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
258         }
259         if (!ptadenserow_oth[col]) {
260           ptadenserow_oth[col]    = -1;
261           ptasparserow_oth[nprow_oth++] = col;
262         }
263       }
264     }
265 
266     /* using symbolic info of local PtA, determine symbolic info for row of C: */
267     cnzi   = 0;
268     /* rows of P_loc */
269     ptaj = ptasparserow_loc;
270     for (j=0; j<nprow_loc; j++) {
271       prow = *ptaj++;
272       prow -= prstart; /* rm */
273       pnzj = pi_loc[prow+1] - pi_loc[prow];
274       pjj  = pj_loc + pi_loc[prow];
275       /* add non-zero cols of P into the sorted linked list lnk */
276       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
277       cnzi += nlnk;
278     }
279     /* rows of P_oth */
280     ptaj = ptasparserow_oth;
281     for (j=0; j<nprow_oth; j++) {
282       prow = *ptaj++;
283       if (prow >= prend) prow -= pm; /* rm */
284       pnzj = pi_oth[prow+1] - pi_oth[prow];
285       pjj  = pj_oth + pi_oth[prow];
286       /* add non-zero cols of P into the sorted linked list lnk */
287       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
288       cnzi += nlnk;
289     }
290 
291     /* If free space is not available, make more free space */
292     /* Double the amount of total space in the list */
293     if (current_space->local_remaining<cnzi) {
294       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
295     }
296 
297     /* Copy data into free space, and zero out denserows */
298     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
299     current_space->array           += cnzi;
300     current_space->local_used      += cnzi;
301     current_space->local_remaining -= cnzi;
302 
303     for (j=0;j<nprow_loc; j++) {
304       ptadenserow_loc[ptasparserow_loc[j]] = 0;
305     }
306     for (j=0;j<nprow_oth; j++) {
307       ptadenserow_oth[ptasparserow_oth[j]] = 0;
308     }
309 
310     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
311     /*        For now, we will recompute what is needed. */
312     ci[i+1] = ci[i] + cnzi;
313   }
314   /* Clean up. */
315   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
316 
317   /* nnz is now stored in ci[pN], column indices are in the list of free space */
318   /* Allocate space for cj, initialize cj, and */
319   /* destroy list of free space and other temporary array(s) */
320   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
321   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
322   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
323   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
324 
325   /* Allocate space for ca */
326   /*
327   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
328   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
329   */
330 
331   /* add C_seq into mpi C              */
332   /*-----------------------------------*/
333   free_space=PETSC_NULL; current_space=PETSC_NULL;
334 
335   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
336   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
337 
338   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
339 
340 
341   /* determine row ownership */
342   /*---------------------------------------------------------*/
343   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
344   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
345   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
346   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
347 
348   /* determine the number of messages to send, their lengths */
349   /*---------------------------------------------------------*/
350   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
351   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
352   len_s  = merge->len_s;
353   len = 0;  /* length of buf_si[] */
354   merge->nsend = 0;
355   for (proc=0; proc<size; proc++){
356     len_si[proc] = 0;
357     if (proc == rank){
358       len_s[proc] = 0;
359     } else {
360       len_si[proc] = owners[proc+1] - owners[proc] + 1;
361       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of rows to be sent to [proc] */
362     }
363     if (len_s[proc]) {
364       merge->nsend++;
365       nrows = 0;
366       for (i=owners[proc]; i<owners[proc+1]; i++){
367         if (ci[i+1] > ci[i]) nrows++;
368       }
369       len_si[proc] = 2*(nrows+1);
370       len += len_si[proc];
371     }
372   }
373 
374   /* determine the number and length of messages to receive for ij-structure */
375   /*-------------------------------------------------------------------------*/
376   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
377   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
378 
379   /* post the Irecv of j-structure */
380   /*-------------------------------*/
381   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
382   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
383 
384   /* post the Isend of j-structure */
385   /*--------------------------------*/
386   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
387   sj_waits = si_waits + merge->nsend;
388 
389   for (proc=0, k=0; proc<size; proc++){
390     if (!len_s[proc]) continue;
391     i = owners[proc];
392     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
393     k++;
394   }
395 
396   /* receives and sends of j-structure are complete */
397   /*------------------------------------------------*/
398   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
399   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
400   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
401 
402   /* send and recv i-structure */
403   /*---------------------------*/
404   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
405   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
406 
407   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
408   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
409   for (proc=0,k=0; proc<size; proc++){
410     if (!len_s[proc]) continue;
411     /* form outgoing message for i-structure:
412          buf_si[0]:                 nrows to be sent
413                [1:nrows]:           row index (local to the destination process)
414                [nrows+1:2*nrows+1]: i-structure index
415     */
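    /* e.g. (illustrative): two nonempty rows with local indices 3 and 7 carrying 4 and 2
       column indices give buf_si = {2, 3, 7, 0, 4, 6}, of length len_si[proc] = 2*(2+1) = 6 */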
416     /*-------------------------------------------*/
417     nrows = len_si[proc]/2 - 1;
418     buf_si_i    = buf_si + nrows+1;
419     buf_si[0]   = nrows;
420     buf_si_i[0] = 0;
421     nrows = 0;
422     for (i=owners[proc]; i<owners[proc+1]; i++){
423       anzi = ci[i+1] - ci[i];
424       if (anzi) {
425         buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */
426         buf_si[nrows+1] = i-owners[proc]; /* local row index */
427         nrows++;
428       }
429     }
430     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
431     k++;
432     buf_si += len_si[proc];
433   }
434 
435   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
436   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
437 
438   ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
439   for (i=0; i<merge->nrecv; i++){
440     ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
441   }
442 
443   ierr = PetscFree(len_si);CHKERRQ(ierr);
444   ierr = PetscFree(len_ri);CHKERRQ(ierr);
445   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
446   ierr = PetscFree(si_waits);CHKERRQ(ierr);
447   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
448   ierr = PetscFree(buf_s);CHKERRQ(ierr);
449   ierr = PetscFree(status);CHKERRQ(ierr);
450 
451   /* compute a local seq matrix in each processor */
452   /*----------------------------------------------*/
453   /* allocate bi array and free space for accumulating nonzero column info */
454   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
455   bi[0] = 0;
456 
457   /* create and initialize a linked list */
458   nlnk = pN+1;
459   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
460 
461   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
462   len = 0;
463   len  = ci[owners[rank+1]] - ci[owners[rank]];
464   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
465   current_space = free_space;
466 
467   /* determine symbolic info for each local row */
468   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
469   nextrow = buf_ri_k + merge->nrecv;
470   nextci  = nextrow + merge->nrecv;
471   for (k=0; k<merge->nrecv; k++){
472     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
473     nrows = *buf_ri_k[k];
474     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
475     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
476   }
477 
478   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
479   len = 0;
480   for (i=0; i<pn; i++) {
481     bnzi   = 0;
482     /* add local non-zero cols of this proc's C_seq into lnk */
483     arow   = owners[rank] + i;
484     anzi   = ci[arow+1] - ci[arow];
485     cji    = cj + ci[arow];
486     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
487     bnzi += nlnk;
488     /* add received col data into lnk */
489     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
490       if (i == *nextrow[k]) { /* i-th row */
491         anzi = *(nextci[k]+1) - *nextci[k];
492         cji  = buf_rj[k] + *nextci[k];
493         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
494         bnzi += nlnk;
495         nextrow[k]++; nextci[k]++;
496       }
497     }
498     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
499 
500     /* if free space is not available, make more free space */
501     if (current_space->local_remaining<bnzi) {
502       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
503       nspacedouble++;
504     }
505     /* copy data into free space, then initialize lnk */
506     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
507     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
508 
509     current_space->array           += bnzi;
510     current_space->local_used      += bnzi;
511     current_space->local_remaining -= bnzi;
512 
513     bi[i+1] = bi[i] + bnzi;
514   }
515 
516   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
517 
518   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
519   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
520   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
521 
522   /* create symbolic parallel matrix B_mpi */
523   /*---------------------------------------*/
524   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
525   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
526   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
527   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
528 
529   /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
530   B_mpi->assembled     = PETSC_FALSE;
531   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
532   merge->bi            = bi;
533   merge->bj            = bj;
534   merge->ci            = ci;
535   merge->cj            = cj;
536   merge->buf_ri        = buf_ri;
537   merge->buf_rj        = buf_rj;
538   merge->C_seq         = PETSC_NULL;
539 
540   /* attach the supporting struct to B_mpi for reuse */
541   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
542   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
543   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
544   *C = B_mpi;
545 
546   PetscFunctionReturn(0);
547 
548 #ifdef TOBEREMOVED
549   PetscErrorCode       ierr;
550   Mat                  C_seq;
551   Mat_MatMatMultMPI    *ptap;
552   PetscObjectContainer container;
553 
554   PetscFunctionBegin;
555 
556   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
557   if (container) {
558     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
559   } else {
560     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
561   }
562   /* compute C_seq = P_loc^T * A_loc * P_seq */
563   ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(A,ptap->B_loc,ptap->B_oth,ptap->brstart,fill,&C_seq);CHKERRQ(ierr);
564 
565   /* add C_seq into mpi C */
566   ierr = MatMerge_SeqsToMPISymbolic(A->comm,C_seq,P->n,P->n,C);CHKERRQ(ierr);
567 
568   PetscFunctionReturn(0);
569 #endif /* TOBEREMOVED */
570 }
571 
572 #undef __FUNCT__
573 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
574 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
575 {
576   PetscErrorCode       ierr;
577   Mat_Merge_SeqsToMPI  *merge;
578   Mat_MatMatMultMPI    *ptap;
579   PetscObjectContainer cont_merge,cont_ptap;
580 
581   PetscInt       flops=0;
582   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
583   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
584   Mat_SeqAIJ     *p_loc,*p_oth;
585   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
586   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
587   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
588   PetscInt       *cjj;
589   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj;
590   MatScalar      *pa_loc,*pA,*pa_oth;
591   PetscInt       am=A->m,cN=C->N;
592 
593   MPI_Comm             comm=C->comm;
594   PetscMPIInt          size,rank,taga,*len_s;
595   PetscInt             *owners;
596   PetscInt             proc;
597   PetscInt             **buf_ri,**buf_rj;
598   PetscInt             cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
599   PetscInt             nrows,**buf_ri_k,**nextrow,**nextcseqi;
600   MPI_Request          *s_waits,*r_waits;
601   MPI_Status           *status;
602   MatScalar            **abuf_r,*ba_i;
603   PetscInt             *cseqi,*cseqj;
604   PetscInt             *cseqj_tmp;
605   MatScalar            *cseqa_tmp;
606 
607   PetscFunctionBegin;
608   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
609   if (cont_merge) {
610     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
611   } else {
612     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
613   }
614   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
615   if (cont_ptap) {
616     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
617   } else {
618     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
619   }
620   /* get data from symbolic products */
621   p_loc=(Mat_SeqAIJ*)(ptap->B_loc)->data;
622   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
623   pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
624   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
625 
626   cseqi = merge->ci; cseqj=merge->cj;
627   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
628   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
629 
630   /* Allocate temporary array for storage of one row of A*P */
631   ierr = PetscMalloc(cN*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
632   ierr = PetscMemzero(apa,cN*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
633   apj      = (PetscInt *)(apa + cN);
634   apjdense = apj + cN;
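  /* apa[0..cN-1] is a dense accumulator for the values of one row of A*P, apj[] lists the
     columns actually touched in that row, and apjdense[] flags columns already recorded in apj[] */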
635 
636   /* Clear old values in C_Seq */
637   ierr = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
638 
639   for (i=0;i<am;i++) {
640     /* Form i-th sparse row of A*P */
641      apnzj = 0;
642     /* diagonal portion of A */
643     anzi  = adi[i+1] - adi[i];
644     for (j=0;j<anzi;j++) {
645       prow = *adj;
646       adj++;
647       pnzj = pi_loc[prow+1] - pi_loc[prow];
648       pjj  = pj_loc + pi_loc[prow];
649       paj  = pa_loc + pi_loc[prow];
650       for (k=0;k<pnzj;k++) {
651         if (!apjdense[pjj[k]]) {
652           apjdense[pjj[k]] = -1;
653           apj[apnzj++]     = pjj[k];
654         }
655         apa[pjj[k]] += (*ada)*paj[k];
656       }
657       flops += 2*pnzj;
658       ada++;
659     }
660     /* off-diagonal portion of A */
661     anzi  = aoi[i+1] - aoi[i];
662     for (j=0;j<anzi;j++) {
663       col = a->garray[*aoj];
664       if (col < cstart){
665         prow = *aoj;
666       } else if (col >= cend){
667         prow = *aoj;
668       } else {
669         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
670       }
671       aoj++;
672       pnzj = pi_oth[prow+1] - pi_oth[prow];
673       pjj  = pj_oth + pi_oth[prow];
674       paj  = pa_oth + pi_oth[prow];
675       for (k=0;k<pnzj;k++) {
676         if (!apjdense[pjj[k]]) {
677           apjdense[pjj[k]] = -1;
678           apj[apnzj++]     = pjj[k];
679         }
680         apa[pjj[k]] += (*aoa)*paj[k];
681       }
682       flops += 2*pnzj;
683       aoa++;
684     }
685     /* Sort the j index array for quick sparse axpy. */
686     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
687 
688     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
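    /* i.e., for each nonzero P_loc(i,crow), add P_loc(i,crow)*AP[i,:] into row crow of C_seq */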
689     pnzi = pi_loc[i+1] - pi_loc[i];
690     for (j=0;j<pnzi;j++) {
691       nextap = 0;
692       crow   = *pJ++;
693       cjj    = cseqj + cseqi[crow];
694       caj    = cseqa + cseqi[crow];
695       /* Perform sparse axpy operation.  Note cjj includes apj. */
696       for (k=0;nextap<apnzj;k++) {
697         if (cjj[k]==apj[nextap]) {
698           caj[k] += (*pA)*apa[apj[nextap++]];
699         }
700       }
701       flops += 2*apnzj;
702       pA++;
703     }
704 
705     /* Zero the current row info for A*P */
706     for (j=0;j<apnzj;j++) {
707       apa[apj[j]]      = 0.;
708       apjdense[apj[j]] = 0;
709     }
710   }
711 
712   ierr = PetscFree(apa);CHKERRQ(ierr);
713   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
714 
715   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
716   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
717 
718   bi     = merge->bi;
719   bj     = merge->bj;
720   buf_ri = merge->buf_ri;
721   buf_rj = merge->buf_rj;
722 
723   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
724   ierr   = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
725 
726   /* send and recv matrix values */
727   /*-----------------------------*/
728   len_s  = merge->len_s;
729   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
730   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
731 
732   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
733   for (proc=0,k=0; proc<size; proc++){
734     if (!len_s[proc]) continue;
735     i = owners[proc];
736     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
737     k++;
738   }
739 
740   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
741   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
742   ierr = PetscFree(status);CHKERRQ(ierr);
743 
744   ierr = PetscFree(s_waits);CHKERRQ(ierr);
745   ierr = PetscFree(r_waits);CHKERRQ(ierr);
746 
747   /* insert mat values of mpimat */
748   /*----------------------------*/
749   ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr);
750   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
751   nextrow = buf_ri_k + merge->nrecv;
752   nextcseqi  = nextrow + merge->nrecv;
753 
754   for (k=0; k<merge->nrecv; k++){
755     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
756     nrows = *(buf_ri_k[k]);
757     nextrow[k]  = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
758     nextcseqi[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
759   }
760 
761   /* set values of ba */
762   for (i=0; i<C->m; i++) {
763     cseqrow = owners[rank] + i;
764     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
765     bnzi = bi[i+1] - bi[i];
766     ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
767 
768     /* add local non-zero vals of this proc's C_seq into ba */
769     cseqnzi = cseqi[cseqrow+1] - cseqi[cseqrow];
770     cseqj_tmp = cseqj + cseqi[cseqrow];
771     cseqa_tmp = cseqa + cseqi[cseqrow];
772     nextcseqj = 0;
773     for (j=0; nextcseqj<cseqnzi; j++){
774       if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
775         ba_i[j] += cseqa_tmp[nextcseqj++];
776       }
777     }
778 
779     /* add received vals into ba */
780     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
781       /* i-th row */
782       if (i == *nextrow[k]) {
783         cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k];
784         cseqj_tmp   = buf_rj[k] + *(nextcseqi[k]);
785         cseqa_tmp   = abuf_r[k] + *(nextcseqi[k]);
786         nextcseqj = 0;
787         for (j=0; nextcseqj<cseqnzi; j++){
788           if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
789             ba_i[j] += cseqa_tmp[nextcseqj++];
790           }
791         }
792         nextrow[k]++; nextcseqi[k]++;
793       }
794     }
795     ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
796   }
797   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
798   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
799 
800   ierr = PetscFree(cseqa);CHKERRQ(ierr);
801   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
802   ierr = PetscFree(ba_i);CHKERRQ(ierr);
803   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
804 
805   PetscFunctionReturn(0);
806 }
807 
808 #ifdef TOBEREMOVED
809 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
810 #undef __FUNCT__
811 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
812 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
813 {
814   PetscErrorCode       ierr;
815   Mat_MatMatMultMPI    *ptap;
816   PetscObjectContainer container;
817 
818   PetscFunctionBegin;
819   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
820   if (container) {
821     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
822     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
823     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
824     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
825     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
826     ierr = PetscFree(ptap);CHKERRQ(ierr);
827     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
828     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
829   }
830 
831 
832   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
833   PetscFunctionReturn(0);
834 }
835 
836 #undef __FUNCT__
837 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
838 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
839 {
840   PetscErrorCode       ierr;
841   Mat_MatMatMultMPI    *ptap;
842   PetscObjectContainer container;
843 
844   PetscFunctionBegin;
845   if (scall == MAT_INITIAL_MATRIX){
846     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
847     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
848 
849     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
850     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
851 
852     /* get P_loc by taking all local rows of P */
853     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
854 
855     /* attach the supporting struct to P for reuse */
856     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
857     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
858     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
859     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
860 
861     /* now, compute symbolic local P^T*A*P */
862     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
863     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
864   } else if (scall == MAT_REUSE_MATRIX){
865     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
866     if (container) {
867       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
868     } else {
869       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
870     }
871     /* update P_oth */
872     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
873 
874   } else {
875     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
876   }
877   /* now, compute numeric local P^T*A*P */
878   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
879   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
880   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
881 
882   PetscFunctionReturn(0);
883 }
884 /* Set symbolic info for i-th row of local product C=P^T*A*P */
885 #define MatPtAPSymbolicLocal_Private(ptM,pti,ptj,ci,cj) 0;\
886 {\
887   /* allocate ci array, arrays for fill computation and */\
888   /* free space for accumulating nonzero column info */\
889   ierr = PetscMalloc((ptM+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);\
890   ci[0] = 0;\
891 \
892   /* set initial free space to be nnz(A) scaled by fill*P->N/PtM. */\
893   /* this should be reasonable if sparsity of PtAP is similar to that of A. */\
894   nnz           = adi[am] + aoi[am];\
895   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/ptM+1),&free_space);CHKERRQ(ierr);\
896   current_space = free_space;\
897 \
898   ptJ=ptj;\
899   for (i=0; i<ptM; i++) {\
900     ptnzi  = pti[i+1] - pti[i];\
901     nprow_loc = 0; nprow_oth = 0;\
902     /* i-th row of symbolic Pt*A: */\
903     for (j=0;j<ptnzi;j++) {\
904       arow = *ptJ++;\
905       /* diagonal portion of A */\
906       anzj = adi[arow+1] - adi[arow];\
907       ajj  = adj + adi[arow];\
908       for (k=0;k<anzj;k++) {\
909         col = ajj[k]+prstart;\
910         if (!ptadenserow_loc[col]) {\
911           ptadenserow_loc[col]    = -1;\
912           ptasparserow_loc[nprow_loc++] = col;\
913         }\
914       }\
915       /* off-diagonal portion of A */\
916       anzj = aoi[arow+1] - aoi[arow];\
917       ajj  = aoj + aoi[arow];\
918       for (k=0;k<anzj;k++) {\
919         col = a->garray[ajj[k]];  /* global col */\
920         if (col < cstart){\
921           col = ajj[k];\
922         } else if (col >= cend){\
923           col = ajj[k] + pm;\
924         } else {\
925           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");\
926         }\
927         if (!ptadenserow_oth[col]) {\
928           ptadenserow_oth[col]    = -1;\
929           ptasparserow_oth[nprow_oth++] = col;\
930         }\
931       }\
932     }\
933     /* determine symbolic info for row of C_seq: */\
934     cnzi   = 0;\
935     /* rows of P_loc */\
936     ptaj = ptasparserow_loc;\
937     for (j=0; j<nprow_loc; j++) {\
938       prow = *ptaj++; \
939       prow -= prstart; /* rm */\
940       pnzj = pi_loc[prow+1] - pi_loc[prow];\
941       pjj  = pj_loc + pi_loc[prow];\
942       /* add non-zero cols of P into the sorted linked list lnk */\
943       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
944       cnzi += nlnk;\
945     }\
946     /* rows of P_oth */\
947     ptaj = ptasparserow_oth;\
948     for (j=0; j<nprow_oth; j++) {\
949       prow = *ptaj++; \
950       if (prow >= prend) prow -= pm; /* rm */\
951       pnzj = pi_oth[prow+1] - pi_oth[prow];\
952       pjj  = pj_oth + pi_oth[prow];\
953       /* add non-zero cols of P into the sorted linked list lnk */\
954       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);\
955       cnzi += nlnk;\
956     }\
957    \
958     /* If free space is not available, make more free space */\
959     /* Double the amount of total space in the list */\
960     if (current_space->local_remaining<cnzi) {\
961       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);\
962     }\
963 \
964     /* Copy data into free space, and zero out denserows */\
965     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);\
966     current_space->array           += cnzi;\
967     current_space->local_used      += cnzi;\
968     current_space->local_remaining -= cnzi;\
969    \
970     for (j=0;j<nprow_loc; j++) {\
971       ptadenserow_loc[ptasparserow_loc[j]] = 0;\
972     }\
973     for (j=0;j<nprow_oth; j++) {\
974       ptadenserow_oth[ptasparserow_oth[j]] = 0;\
975     }\
976     \
977     /* Aside: Perhaps we should save the pta info for the numerical factorization. */\
978     /*        For now, we will recompute what is needed. */ \
979     ci[i+1] = ci[i] + cnzi;\
980   }\
981   /* nnz is now stored in ci[ptm], column indices are in the list of free space */\
982   /* Allocate space for cj, initialize cj, and */\
983   /* destroy list of free space and other temporary array(s) */\
984   ierr = PetscMalloc((ci[ptM]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);\
985   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);\
986 }
987 #undef __FUNCT__
988 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
989 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
990 {
991   PetscErrorCode       ierr;
992   Mat                  P_loc,P_oth;
993   Mat_MatMatMultMPI    *ptap;
994   PetscObjectContainer container;
995   FreeSpaceList        free_space=PETSC_NULL,current_space;
996   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
997   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,
998                        *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
999   Mat_SeqAIJ           *p_loc,*p_oth;
1000   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj,*rmap=p->garray;
1001   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
1002   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
1003   PetscInt             aN=A->N,am=A->m,pN=P->N;
1004   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi,row;
1005   PetscBT              lnkbt;
1006   PetscInt             prstart,prend,nprow_loc,nprow_oth;
1007   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
1008 
1009   MPI_Comm             comm=A->comm;
1010   Mat                  B_mpi;
1011   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
1012   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
1013   PetscInt             len,proc,*dnz,*onz,*owners;
1014   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
1015   PetscInt             nrows,tnrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1016   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
1017   MPI_Status           *status;
1018   Mat_Merge_SeqsToMPI  *merge;
1019   PetscInt             *ptdi,*ptdj,*cdi,*cdj;
1020   PetscInt             *ptoi,*ptoj,*coi,*coj;
1021 
1022   PetscFunctionBegin;
1023   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
1024   if (container) {
1025     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
1026   } else {
1027     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
1028   }
1029 
1030   /* determine row ownership of C */
1031   /*---------------------------------------------------------*/
1032   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1033   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1034   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
1035 
1036   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
1037   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
1038   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
1039   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
1040 
1041   /* get data from P_loc and P_oth */
1042   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
1043   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
1044   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
1045   prend = prstart + pm;
1046 
1047   /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
1048   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
1049   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
1050   ptasparserow_loc = ptadenserow_loc + aN;
1051   ptadenserow_oth  = ptasparserow_loc + aN;
1052   ptasparserow_oth = ptadenserow_oth + aN;
1053 
1054   /* create and initialize a linked list */
1055   nlnk = pN+1;
1056   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1057 
1058   /* Pt = P_loc^T */
1059   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1060   ierr = MatPtAPSymbolicLocal_Private(pN,pti,ptj,ci,cj);CHKERRQ(ierr);
1061 
1062   /* Pt = Pd^T - diagonal portion of P_loc */
1063   ierr = MatGetSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);
1064   ierr = MatPtAPSymbolicLocal_Private(pn,ptdi,ptdj,cdi,cdj);CHKERRQ(ierr);
1065   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->A,&ptdi,&ptdj);CHKERRQ(ierr);
1066 #ifdef NEW
1067   /* Pt = Po^T - off-diagonal portion of P_loc */
1068   ierr = MatGetSymbolicTranspose_SeqAIJ(p->B,&ptoi,&ptoj);CHKERRQ(ierr);
1069   ierr = MatPtAPSymbolicLocal_Private(tnrows,ptoi,ptoj,coi,coj);CHKERRQ(ierr);
1070   ierr = MatRestoreSymbolicTranspose_SeqAIJ(p->B,&ptoi,&ptoj);CHKERRQ(ierr);
1071 #endif
1072   /* clean up */
1073   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
1074   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1075   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
1076 
1077   /* determine the number of messages to send, their lengths */
1078   /*---------------------------------------------------------*/
1079   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
1080   ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
1081   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
1082   len_s  = merge->len_s;
1083   len = 0;  /* max length of buf_si[] */
1084   merge->nsend = 0;
1085   tnrows = (p->B)->N; /* total num of rows to be sent to other processors */
1086   proc = 0;
1087   for (i=0; i<tnrows; i++){
1088     while (rmap[i] >= owners[proc+1]) proc++;
1089     len_si[proc]++;
1090   }
1091   for (proc=0; proc<size; proc++){
1092     len_s[proc] = 0;
1093     if (len_si[proc]){
1094       merge->nsend++;
1095       len_si[proc] = 2*(len_si[proc] + 1);
1096       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of col indices to be sent to [proc] */
1097       len += len_si[proc];
1098     }
1099   }
1100 
1101   /* determine the number and length of messages to receive for ij-structure */
1102   /*-------------------------------------------------------------------------*/
1103   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
1104   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
1105 
1106   /* post the Irecv of j-structure */
1107   /*-------------------------------*/
1108   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
1109   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
1110 
1111   /* post the Isend of j-structure */
1112   /*--------------------------------*/
1113   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
1114   sj_waits = si_waits + merge->nsend;
1115 
1116   for (proc=0, k=0; proc<size; proc++){
1117     if (!len_s[proc]) continue;
1118     i = owners[proc];
1119     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
1120     k++;
1121   }
1122 
1123   /* receives and sends of j-structure are complete */
1124   /*------------------------------------------------*/
1125   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
1126   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
1127   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
1128 
1129   /* send and recv i-structure */
1130   /*---------------------------*/
1131   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
1132   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
1133 
1134   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
1135   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1136 
1137   j = 0; /* row index to be sent */
1138   for (proc=0,k=0; proc<size; proc++){
1139     if (!len_s[proc]) continue;
1140     /* form outgoing message for i-structure:
1141          buf_si[0]:                 nrows to be sent
1142                [1:nrows]:           row index (local to the destination process)
1143                [nrows+1:2*nrows+1]: i-structure index
1144     */
1145     /*-------------------------------------------*/
1146     nrows = len_si[proc]/2 - 1;
1147     buf_si_i    = buf_si + nrows+1;
1148     buf_si[0]   = nrows;
1149     buf_si_i[0] = 0;
1150     for (i=0; i<nrows; i++){
1151       row = rmap[j++];
1152       anzi = ci[row+1] - ci[row];
1153       buf_si_i[i+1] = buf_si_i[i] + anzi; /* i-structure */
1154       buf_si[i+1]   = row - owners[proc]; /* local row index */
1155     }
1156     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
1157     k++;
1158     buf_si += len_si[proc];
1159   }
1160 
1161   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
1162   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
1163 
1164   ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
1165   for (i=0; i<merge->nrecv; i++){
1166     ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
1167   }
1168 
1169   ierr = PetscFree(len_si);CHKERRQ(ierr);
1170   ierr = PetscFree(len_ri);CHKERRQ(ierr);
1171   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
1172   ierr = PetscFree(si_waits);CHKERRQ(ierr);
1173   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
1174   ierr = PetscFree(buf_s);CHKERRQ(ierr);
1175   ierr = PetscFree(status);CHKERRQ(ierr);
1176 
1177   /* compute a local seq matrix in each processor */
1178   /*----------------------------------------------*/
1179   /* allocate bi array and free space for accumulating nonzero column info */
1180   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
1181   bi[0] = 0;
1182 
1183   /* create and initialize a linked list */
1184   nlnk = pN+1;
1185   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1186 
1187   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
1188   len = 0;
1189   len  = ci[owners[rank+1]] - ci[owners[rank]];
1190   free_space=PETSC_NULL;
1191   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
1192   current_space = free_space;
1193 
1194   /* determine symbolic info for each local row of C */
1195   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
1196   nextrow = buf_ri_k + merge->nrecv;
1197   nextci  = nextrow + merge->nrecv;
1198   for (k=0; k<merge->nrecv; k++){
1199     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1200     nrows = *buf_ri_k[k];
1201     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1202     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1203   }
1204 
1205   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
1206   len = 0;
1207   for (i=0; i<pn; i++) {
1208     bnzi = 0;
1209     /* add local non-zero cols of this proc's C_seq into lnk */
1210     anzi = cdi[i+1] - cdi[i];
1211     cji  = cdj + cdi[i];
1212     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1213     bnzi += nlnk;
1214     /* add received col data into lnk */
1215     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1216       if (i == *nextrow[k]) { /* i-th row */
1217         anzi = *(nextci[k]+1) - *nextci[k];
1218         cji  = buf_rj[k] + *nextci[k];
1219         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1220         bnzi += nlnk;
1221         nextrow[k]++; nextci[k]++;
1222       }
1223     }
1224     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
1225 
1226     /* if free space is not available, make more free space */
1227     if (current_space->local_remaining<bnzi) {
1228       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1229       nspacedouble++;
1230     }
1231     /* copy data into free space, then initialize lnk */
1232     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1233     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
1234 
1235     current_space->array           += bnzi;
1236     current_space->local_used      += bnzi;
1237     current_space->local_remaining -= bnzi;
1238 
1239     bi[i+1] = bi[i] + bnzi;
1240   }
1241 
1242   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
1243 
1244   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
1245   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
1246   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1247 
1248   /* create symbolic parallel matrix B_mpi */
1249   /*---------------------------------------*/
1250   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
1251   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
1252   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
1253   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
1254 
1255   /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
1256   B_mpi->assembled     = PETSC_FALSE;
1257   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
1258   merge->bi            = bi;
1259   merge->bj            = bj;
1260   merge->ci            = ci;
1261   merge->cj            = cj;
1262   merge->buf_ri        = buf_ri;
1263   merge->buf_rj        = buf_rj;
1264   merge->C_seq         = PETSC_NULL;
1265 
1266   /* attach the supporting struct to B_mpi for reuse */
1267   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
1268   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
1269   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
1270   *C = B_mpi;
1271 
1272   ierr = PetscFree(cdi);CHKERRQ(ierr);
1273   ierr = PetscFree(cdj);CHKERRQ(ierr);
1274 #ifdef NEW
1275   ierr = PetscFree(coi);CHKERRQ(ierr);
1276   ierr = PetscFree(coj);CHKERRQ(ierr);
1277 #endif
1278   PetscFunctionReturn(0);
1279 }
1280 
1281 #undef __FUNCT__
1282 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
1283 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1284 {
1285   PetscErrorCode       ierr;
1286   Mat_Merge_SeqsToMPI  *merge;
1287   Mat_MatMatMultMPI    *ptap;
1288   PetscObjectContainer cont_merge,cont_ptap;
1289   PetscInt             flops=0;
1290   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
1291   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1292   Mat_SeqAIJ           *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data,*p_oth;
1293   Mat_SeqAIJ           *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
1294   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense;
1295   PetscInt             *pi_oth,*pj_oth,*pJ_d=pd->j,*pJ_o=po->j,*pjj;
1296   PetscInt             i,j,k,nzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1297   MatScalar            *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj,**abuf_r,*ba_i,*ca;
1298   MatScalar            *pA_d=pd->a,*pA_o=po->a,*pa_oth;
1299   PetscInt             am=A->m,cN=C->N,cm=C->m;
1300   MPI_Comm             comm=C->comm;
1301   PetscMPIInt          size,rank,taga,*len_s,**buf_ri,**buf_rj,**buf_ri_k,**nextrow,**nextcseqi;
1302   PetscInt             *owners,proc,nrows;
1303   PetscInt             *cjj,cseqnzi,*bj_i,*bi,*bj,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
1304   MPI_Request          *s_waits,*r_waits;
1305   MPI_Status           *status;
1306   PetscInt             *cseqi,*cseqj,col;
1307   PetscInt             stages[3];
1308 
1309   PetscFunctionBegin;
1310   ierr = PetscLogStageRegister(&stages[0],"NumAP_local");CHKERRQ(ierr);
1311   ierr = PetscLogStageRegister(&stages[1],"NumPtAP_local");CHKERRQ(ierr);
1312   ierr = PetscLogStageRegister(&stages[2],"NumPtAP_Comm");CHKERRQ(ierr);
1313 
1314   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
1315   if (cont_merge) {
1316     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
1317   } else {
1318     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
1319   }
1320   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
1321   if (cont_ptap) {
1322     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
1323   } else {
1324     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
1325   }
1326   /* get data from symbolic products */
1327   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
1328   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
1329 
1330   cseqi = merge->ci; cseqj=merge->cj;
1331   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
1332   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
1333 
1334   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1335   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
1336   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
1337 
1338   /* Allocate temporary array for storage of one row of A*P and one row of C */
1339   ierr = PetscMalloc(2*cN*(sizeof(MatScalar)+sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1340   ierr = PetscMemzero(apa,2*cN*(sizeof(MatScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
1341   ca       = (MatScalar*)(apa + cN);
1342   apj      = (PetscInt *)(ca + cN);
1343   apjdense = (PetscInt *)(apj + cN);
1344 
1345   /* Clear old values in C */
1346   ierr = PetscMemzero(cd->a,cd->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1347   ierr = PetscMemzero(co->a,co->i[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1348 
1349   for (i=0;i<am;i++) {
1350     ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
1351     /* Form i-th sparse row of A*P */
1352     apnzj = 0;
1353     /* diagonal portion of A */
1354     nzi  = adi[i+1] - adi[i];
1355     for (j=0;j<nzi;j++) {
1356       prow = *adj++;
1357       /* diagonal portion of P */
1358       pnzj = pd->i[prow+1] - pd->i[prow];
1359       pjj  = pd->j + pd->i[prow]; /* local col index of P */
1360       paj  = pd->a + pd->i[prow];
1361       for (k=0;k<pnzj;k++) {
1362         col = *pjj + p->cstart; pjj++; /* global col index of P */
1363         if (!apjdense[col]) {
1364           apjdense[col] = -1;
1365           apj[apnzj++]  = col;
1366         }
1367         apa[col] += (*ada)*paj[k];
1368       }
1369       flops += 2*pnzj;
1370       /* off-diagonal portion of P */
1371       pnzj = po->i[prow+1] - po->i[prow];
1372       pjj  = po->j + po->i[prow]; /* local col index of P */
1373       paj  = po->a + po->i[prow];
1374       for (k=0;k<pnzj;k++) {
1375         col = p->garray[*pjj]; pjj++; /* global col index of P */
1376         if (!apjdense[col]) {
1377           apjdense[col] = -1;
1378           apj[apnzj++]  = col;
1379         }
1380         apa[col] += (*ada)*paj[k];
1381       }
1382       flops += 2*pnzj;
1383 
1384       ada++;
1385     }
1386     /* off-diagonal portion of A */
1387     nzi  = aoi[i+1] - aoi[i];
1388     for (j=0;j<nzi;j++) {
1389       prow = *aoj++;
1390       pnzj = pi_oth[prow+1] - pi_oth[prow];
1391       pjj  = pj_oth + pi_oth[prow];
1392       paj  = pa_oth + pi_oth[prow];
1393       for (k=0;k<pnzj;k++) {
1394         if (!apjdense[pjj[k]]) {
1395           apjdense[pjj[k]] = -1;
1396           apj[apnzj++]     = pjj[k];
1397         }
1398         apa[pjj[k]] += (*aoa)*paj[k];
1399       }
1400       flops += 2*pnzj;
1401       aoa++;
1402     }
1403     /* Sort the j index array for quick sparse axpy. */
1404     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1405 
1406     ierr = PetscLogStagePop();CHKERRQ(ierr);
1407     ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
1408     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
1409     /* diagonal portion of P -- gives matrix value of local C */
1410     pnzi = pd->i[i+1] - pd->i[i];
1411     for (j=0;j<pnzi;j++) {
1412       /* add value into C */
1413       for (k=0; k<apnzj; k++) ca[k] = (*pA_d)*apa[apj[k]];
1414       crow = (*pJ_d++) + owners[rank];
1415       ierr = MatSetValues(C,1,&crow,apnzj,apj,ca,ADD_VALUES);CHKERRQ(ierr);
1416       ierr = PetscMemzero(ca,apnzj*(sizeof(MatScalar)));CHKERRQ(ierr);
1417       pA_d++;
1418       flops += 2*apnzj;
1419     }
1420 
1421     /* off-diagonal portion of P -- gives matrix values to be sent to others */
1422     pnzi = po->i[i+1] - po->i[i];
1423     for (j=0;j<pnzi;j++) {
1424       crow   = p->garray[*pJ_o++];
1425       cjj    = cseqj + cseqi[crow];
1426       caj    = cseqa + cseqi[crow];
1427       /* add value into C_seq to be sent to other processors */
1428       nextap = 0;
1429       for (k=0;nextap<apnzj;k++) {
1430         if (cjj[k]==apj[nextap]) {
1431           caj[k] += (*pA_o)*apa[apj[nextap++]];
1432         }
1433       }
1434       flops += 2*apnzj;
1435       pA_o++;
1436     }
1437 
1438     /* Zero the current row info for A*P */
1439     for (j=0;j<apnzj;j++) {
1440       apa[apj[j]]      = 0.;
1441       apjdense[apj[j]] = 0;
1442     }
1443     ierr = PetscLogStagePop();CHKERRQ(ierr);
1444   }
1445 
1446   /* send and recv matrix values */
1447   /*-----------------------------*/
1448   ierr = PetscLogStagePush(stages[2]);CHKERRQ(ierr);
1449   bi     = merge->bi;
1450   bj     = merge->bj;
1451   buf_ri = merge->buf_ri;
1452   buf_rj = merge->buf_rj;
1453   len_s  = merge->len_s;
1454   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
1455   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
1456 
1457   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
1458   for (proc=0,k=0; proc<size; proc++){
1459     if (!len_s[proc]) continue;
1460     i = owners[proc];
1461     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
1462     k++;
1463   }
1464   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
1465   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
1466   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
1467   ierr = PetscFree(status);CHKERRQ(ierr);
1468 
1469   ierr = PetscFree(s_waits);CHKERRQ(ierr);
1470   ierr = PetscFree(r_waits);CHKERRQ(ierr);
1471   ierr = PetscFree(cseqa);CHKERRQ(ierr);
1472 
1473   /* insert mat values of mpimat */
1474   /*----------------------------*/
1475   ba_i = apa; /* rename, temporary array for storage of one row of C */
1476   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
1477   nextrow = buf_ri_k + merge->nrecv;
1478   nextcseqi = nextrow + merge->nrecv;
1479 
1480   for (k=0; k<merge->nrecv; k++){
1481     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1482     nrows        = *(buf_ri_k[k]);
1483     nextrow[k]   = buf_ri_k[k]+1;  /* next row index of k-th received i-structure */
1484     nextcseqi[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
1485   }
1486 
1487   /* add received local vals to C */
1488   for (i=0; i<cm; i++) {
1489     crow = owners[rank] + i;
1490     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
1491     bnzi = bi[i+1] - bi[i];
1492     nzi = 0;
1493     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
1494       /* i-th row */
1495       if (i == *nextrow[k]) {
1496         cseqnzi = *(nextcseqi[k]+1) - *nextcseqi[k];
1497         cseqj   = buf_rj[k] + *(nextcseqi[k]);
1498         cseqa   = abuf_r[k] + *(nextcseqi[k]);
1499         nextcseqj = 0;
1500         for (j=0; nextcseqj<cseqnzi; j++){
1501           if (*(bj_i + j) == cseqj[nextcseqj]){ /* bcol == cseqcol */
1502             ba_i[j] += cseqa[nextcseqj++];
1503             nzi++;
1504           }
1505         }
1506         nextrow[k]++; nextcseqi[k]++;
1507       }
1508     }
1509     if (nzi>0){
1510       ierr = MatSetValues(C,1,&crow,bnzi,bj_i,ba_i,ADD_VALUES);CHKERRQ(ierr);
1511       ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
1512       flops += 2*nzi;
1513     }
1514   }
1515   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1516   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1517   ierr = PetscLogStagePop();CHKERRQ(ierr);
1518 
1519   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
1520   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
1521   ierr = PetscFree(apa);CHKERRQ(ierr);
1522   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
1523 
1524   PetscFunctionReturn(0);
1525 }
1526 #endif /* TOBEREMOVED */
1527 
1528 #undef __FUNCT__
1529 #define __FUNCT__ "MatPtAP_SeqAIJ_SeqAIJ"
1530 PetscErrorCode MatPtAP_SeqAIJ_SeqAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
1531 {
1532   PetscErrorCode ierr;
1533 
1534   PetscFunctionBegin;
1535   if (scall == MAT_INITIAL_MATRIX){
1536     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1537     ierr = MatPtAPSymbolic_SeqAIJ_SeqAIJ(A,P,fill,C);CHKERRQ(ierr);
1538     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1539   }
1540   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1541   ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,*C);CHKERRQ(ierr);
1542   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1543   PetscFunctionReturn(0);
1544 }
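
/*
   Illustrative usage sketch (not part of this implementation): the driver above performs the
   symbolic phase only on the first call (MAT_INITIAL_MATRIX) and repeats just the numeric phase
   when the product is recomputed (MAT_REUSE_MATRIX).  The function name SampleProjection() and
   the fill estimate 1.0 are assumptions made for this example.
*/
#if 0
PetscErrorCode SampleProjection(Mat A,Mat P)
{
  PetscErrorCode ierr;
  Mat            C;

  PetscFunctionBegin;
  /* first call: creates C, doing the symbolic and numeric products */
  ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
  /* ... change the numerical values of A (same nonzero pattern) ... */
  /* later calls: reuse the structure of C, redoing only the numeric product */
  ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);
  ierr = MatDestroy(C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif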
1545 
1546 /*
1547    MatPtAPSymbolic - Creates the (i,j) structure of the matrix projection C = P^T * A * P
1548 
1549    Collective on Mat
1550 
1551    Input Parameters:
1552 +  A - the matrix
1553 -  P - the projection matrix
1554 
1555    Output Parameters:
1556 .  C - the (i,j) structure of the product matrix
1557 
1558    Notes:
1559    C will be created and must be destroyed by the user with MatDestroy().
1560 
1561    This routine is currently only implemented for pairs of SeqAIJ matrices and classes
1562    which inherit from SeqAIJ.  C will be of type MATSEQAIJ.  The product is computed using
1563    this (i,j) structure by calling MatPtAPNumeric().
1564 
1565    Level: intermediate
1566 
1567 .seealso: MatPtAP(),MatPtAPNumeric(),MatMatMultSymbolic()
1568 */
1569 #undef __FUNCT__
1570 #define __FUNCT__ "MatPtAPSymbolic"
1571 PetscErrorCode MatPtAPSymbolic(Mat A,Mat P,PetscReal fill,Mat *C)
1572 {
1573   PetscErrorCode ierr;
1574   PetscErrorCode (*fA)(Mat,Mat,PetscReal,Mat*);
1575   PetscErrorCode (*fP)(Mat,Mat,PetscReal,Mat*);
1576 
1577   PetscFunctionBegin;
1578 
1579   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
1580   PetscValidType(A,1);
1581   MatPreallocated(A);
1582   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1583   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1584 
1585   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
1586   PetscValidType(P,2);
1587   MatPreallocated(P);
1588   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1589   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1590 
1591   PetscValidPointer(C,3);
1592 
1593   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
1594   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
1595 
1596   /* For now, we do not dispatch based on the type of A and P */
1597   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
1598   fA = A->ops->ptapsymbolic;
1599   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for A of type %s",A->type_name);
1600   fP = P->ops->ptapsymbolic;
1601   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for P of type %s",P->type_name);
1602   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPSymbolic requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
1603 
1604   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1605   ierr = (*fA)(A,P,fill,C);CHKERRQ(ierr);
1606   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
1607 
1608   PetscFunctionReturn(0);
1609 }
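
/*
   Illustrative sketch (assumptions only) of the two-phase use described in the comment above:
   call MatPtAPSymbolic() once to build the (i,j) structure of C, then call MatPtAPNumeric()
   each time the numerical values of A change while its nonzero pattern stays fixed.  The name
   SampleSplitProjection(), the loop count nsteps, and the fill estimate 1.0 are not taken from
   this file.
*/
#if 0
PetscErrorCode SampleSplitProjection(Mat A,Mat P,PetscInt nsteps)
{
  PetscErrorCode ierr;
  Mat            C;
  PetscInt       step;

  PetscFunctionBegin;
  ierr = MatPtAPSymbolic(A,P,1.0,&C);CHKERRQ(ierr);   /* (i,j) structure of C only */
  for (step=0; step<nsteps; step++) {
    /* ... update the entries of A here ... */
    ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);       /* fill in the values of C */
  }
  ierr = MatDestroy(C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif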
1610 
1611 typedef struct {
1612   Mat    symAP;
1613 } Mat_PtAPstruct;
1614 
1615 EXTERN PetscErrorCode MatDestroy_SeqAIJ(Mat);
1616 
1617 #undef __FUNCT__
1618 #define __FUNCT__ "MatDestroy_SeqAIJ_PtAP"
1619 PetscErrorCode MatDestroy_SeqAIJ_PtAP(Mat A)
1620 {
1621   PetscErrorCode    ierr;
1622   Mat_PtAPstruct    *ptap=(Mat_PtAPstruct*)A->spptr;
1623 
1624   PetscFunctionBegin;
1625   ierr = MatDestroy(ptap->symAP);CHKERRQ(ierr);
1626   ierr = PetscFree(ptap);CHKERRQ(ierr);
1627   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
1628   PetscFunctionReturn(0);
1629 }
1630 
1631 #undef __FUNCT__
1632 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqAIJ"
1633 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
1634 {
1635   PetscErrorCode ierr;
1636   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1637   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*p = (Mat_SeqAIJ*)P->data,*c;
1638   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
1639   PetscInt       *ci,*cj,*ptadenserow,*ptasparserow,*ptaj;
1640   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M;
1641   PetscInt       i,j,k,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,nlnk,*lnk;
1642   MatScalar      *ca;
1643   PetscBT        lnkbt;
1644 
1645   PetscFunctionBegin;
1646   /* Get ij structure of P^T */
1647   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1648   ptJ=ptj;
1649 
1650   /* Allocate ci array, arrays for fill computation and */
1651   /* free space for accumulating nonzero column info */
1652   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1653   ci[0] = 0;
1654 
1655   ierr = PetscMalloc((2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1656   ierr = PetscMemzero(ptadenserow,(2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1657   ptasparserow = ptadenserow  + an;
1658 
1659   /* create and initialize a linked list */
1660   nlnk = pn+1;
1661   ierr = PetscLLCreate(pn,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1662 
1663   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1664   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1665   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
1666   current_space = free_space;
1667 
1668   /* Determine symbolic info for each row of C: */
1669   for (i=0;i<pn;i++) {
1670     ptnzi  = pti[i+1] - pti[i];
1671     ptanzi = 0;
1672     /* Determine symbolic row of PtA: */
1673     for (j=0;j<ptnzi;j++) {
1674       arow = *ptJ++;
1675       anzj = ai[arow+1] - ai[arow];
1676       ajj  = aj + ai[arow];
1677       for (k=0;k<anzj;k++) {
1678         if (!ptadenserow[ajj[k]]) {
1679           ptadenserow[ajj[k]]    = -1;
1680           ptasparserow[ptanzi++] = ajj[k];
1681         }
1682       }
1683     }
1684     /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1685     ptaj = ptasparserow;
1686     cnzi   = 0;
1687     for (j=0;j<ptanzi;j++) {
1688       prow = *ptaj++;
1689       pnzj = pi[prow+1] - pi[prow];
1690       pjj  = pj + pi[prow];
1691       /* add non-zero cols of P into the sorted linked list lnk */
1692       ierr = PetscLLAdd(pnzj,pjj,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
1693       cnzi += nlnk;
1694     }
1695 
1696     /* If free space is not available, make more free space */
1697     /* Double the amount of total space in the list */
1698     if (current_space->local_remaining<cnzi) {
1699       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1700     }
1701 
1702     /* Copy data into free space, and zero out denserows */
1703     ierr = PetscLLClean(pn,pn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
1704     current_space->array           += cnzi;
1705     current_space->local_used      += cnzi;
1706     current_space->local_remaining -= cnzi;
1707 
1708     for (j=0;j<ptanzi;j++) {
1709       ptadenserow[ptasparserow[j]] = 0;
1710     }
1711     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1712     /*        For now, we will recompute what is needed. */
1713     ci[i+1] = ci[i] + cnzi;
1714   }
1715   /* nnz is now stored in ci[ptm], column indices are in the list of free space */
1716   /* Allocate space for cj, initialize cj, and */
1717   /* destroy list of free space and other temporary array(s) */
1718   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1719   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1720   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1721   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
1722 
1723   /* Allocate space for ca */
1724   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1725   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1726 
1727   /* put together the new matrix */
1728   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1729 
1730   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1731   /* Since these are PETSc arrays, change flags to free them as necessary. */
1732   c = (Mat_SeqAIJ *)((*C)->data);
1733   c->freedata = PETSC_TRUE;
1734   c->nonew    = 0;
1735 
1736   /* Clean up. */
1737   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1738 
1739   PetscFunctionReturn(0);
1740 }
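
/*
   Sketch of the row-merge idea used by the symbolic routine above (illustrative only; the real
   code uses the PetscLL* linked-list primitives so each merged row comes out sorted).  One row
   of C is the union of the column indices of the rows of P picked out by the sparse row of
   P^T*A; a dense marker array gives that union in time proportional to the entries visited.
   All names below are local to this example.
*/
#if 0
static PetscInt SymbolicRowUnion(PetscInt nrows,const PetscInt rows[],    /* rows of P to merge */
                                 const PetscInt pi[],const PetscInt pj[], /* CSR structure of P */
                                 PetscInt marker[],                       /* work array, zeroed, length >= ncols(P) */
                                 PetscInt cols[])                         /* out: merged column indices */
{
  PetscInt r,k,cnt = 0;
  for (r=0; r<nrows; r++) {
    for (k=pi[rows[r]]; k<pi[rows[r]+1]; k++) {
      if (!marker[pj[k]]) { marker[pj[k]] = 1; cols[cnt++] = pj[k]; }
    }
  }
  for (k=0; k<cnt; k++) marker[cols[k]] = 0; /* reset marker for the next row of C */
  return cnt;                                /* nnz of this row of C; cols[] is unsorted here */
}
#endif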
1741 
1742 #include "src/mat/impls/maij/maij.h"
1743 EXTERN_C_BEGIN
1744 #undef __FUNCT__
1745 #define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqMAIJ"
1746 PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqMAIJ(Mat A,Mat PP,Mat *C)
1747 {
1748   /* This routine requires testing -- I don't think it works. */
1749   PetscErrorCode ierr;
1750   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
1751   Mat_SeqMAIJ    *pp=(Mat_SeqMAIJ*)PP->data;
1752   Mat            P=pp->AIJ;
1753   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*p=(Mat_SeqAIJ*)P->data,*c;
1754   PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
1755   PetscInt       *ci,*cj,*denserow,*sparserow,*ptadenserow,*ptasparserow,*ptaj;
1756   PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M,ppdof=pp->dof;
1757   PetscInt       i,j,k,dof,pdof,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi;
1758   MatScalar      *ca;
1759 
1760   PetscFunctionBegin;
1761   /* Start timer */
1762   ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1763 
1764   /* Get ij structure of P^T */
1765   ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1766 
1767   /* Allocate ci array, arrays for fill computation and */
1768   /* free space for accumulating nonzero column info */
1769   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
1770   ci[0] = 0;
1771 
1772   ierr = PetscMalloc((2*pn+2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
1773   ierr = PetscMemzero(ptadenserow,(2*pn+2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
1774   ptasparserow = ptadenserow  + an;
1775   denserow     = ptasparserow + an;
1776   sparserow    = denserow     + pn;
1777 
1778   /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
1779   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
1780   ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
1781   current_space = free_space;
1782 
1783   /* Determine symbolic info for each row of C: */
1784   for (i=0;i<pn/ppdof;i++) {
1785     ptnzi  = pti[i+1] - pti[i];
1786     ptanzi = 0;
1787     ptJ    = ptj + pti[i];
1788     for (dof=0;dof<ppdof;dof++) {
1789     /* Determine symbolic row of PtA: */
1790       for (j=0;j<ptnzi;j++) {
1791         arow = ptJ[j] + dof;
1792         anzj = ai[arow+1] - ai[arow];
1793         ajj  = aj + ai[arow];
1794         for (k=0;k<anzj;k++) {
1795           if (!ptadenserow[ajj[k]]) {
1796             ptadenserow[ajj[k]]    = -1;
1797             ptasparserow[ptanzi++] = ajj[k];
1798           }
1799         }
1800       }
1801       /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
1802       ptaj = ptasparserow;
1803       cnzi   = 0;
1804       for (j=0;j<ptanzi;j++) {
1805         pdof = *ptaj%ppdof;
1806         prow = (*ptaj++)/ppdof;
1807         pnzj = pi[prow+1] - pi[prow];
1808         pjj  = pj + pi[prow];
1809         for (k=0;k<pnzj;k++) {
1810           if (!denserow[pjj[k]+pdof]) {
1811             denserow[pjj[k]+pdof] = -1;
1812             sparserow[cnzi++]     = pjj[k]+pdof;
1813           }
1814         }
1815       }
1816 
1817       /* sort sparserow */
1818       ierr = PetscSortInt(cnzi,sparserow);CHKERRQ(ierr);
1819 
1820       /* If free space is not available, make more free space */
1821       /* Double the amount of total space in the list */
1822       if (current_space->local_remaining<cnzi) {
1823         ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
1824       }
1825 
1826       /* Copy data into free space, and zero out denserows */
1827       ierr = PetscMemcpy(current_space->array,sparserow,cnzi*sizeof(PetscInt));CHKERRQ(ierr);
1828       current_space->array           += cnzi;
1829       current_space->local_used      += cnzi;
1830       current_space->local_remaining -= cnzi;
1831 
1832       for (j=0;j<ptanzi;j++) {
1833         ptadenserow[ptasparserow[j]] = 0;
1834       }
1835       for (j=0;j<cnzi;j++) {
1836         denserow[sparserow[j]] = 0;
1837       }
1838       /* Aside: Perhaps we should save the pta info for the numerical factorization. */
1839       /*        For now, we will recompute what is needed. */
1840       ci[i+1+dof] = ci[i+dof] + cnzi;
1841     }
1842   }
1843   /* nnz is now stored in ci[ptm], column indices are in the list of free space */
1844   /* Allocate space for cj, initialize cj, and */
1845   /* destroy list of free space and other temporary array(s) */
1846   ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
1847   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
1848   ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
1849 
1850   /* Allocate space for ca */
1851   ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
1852   ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);
1853 
1854   /* put together the new matrix */
1855   ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);
1856 
1857   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
1858   /* Since these are PETSc arrays, change flags to free them as necessary. */
1859   c = (Mat_SeqAIJ *)((*C)->data);
1860   c->freedata = PETSC_TRUE;
1861   c->nonew    = 0;
1862 
1863   /* Clean up. */
1864   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
1865 
1866   ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
1867   PetscFunctionReturn(0);
1868 }
1869 EXTERN_C_END
1870 
1871 /*
1872    MatPtAPNumeric - Computes the matrix projection C = P^T * A * P
1873 
1874    Collective on Mat
1875 
1876    Input Parameters:
1877 +  A - the matrix
1878 -  P - the projection matrix
1879 
1880    Output Parameters:
1881 .  C - the product matrix
1882 
1883    Notes:
1884    C must have been created by calling MatPtAPSymbolic() and must be destroyed by
1885    the user with MatDestroy().
1886 
1887    This routine is currently only implemented for pairs of AIJ matrices and classes
1888    which inherit from AIJ.  C will be of type MATAIJ.
1889 
1890    Level: intermediate
1891 
1892 .seealso: MatPtAP(),MatPtAPSymbolic(),MatMatMultNumeric()
1893 */
1894 #undef __FUNCT__
1895 #define __FUNCT__ "MatPtAPNumeric"
1896 PetscErrorCode MatPtAPNumeric(Mat A,Mat P,Mat C)
1897 {
1898   PetscErrorCode ierr;
1899   PetscErrorCode (*fA)(Mat,Mat,Mat);
1900   PetscErrorCode (*fP)(Mat,Mat,Mat);
1901 
1902   PetscFunctionBegin;
1903 
1904   PetscValidHeaderSpecific(A,MAT_COOKIE,1);
1905   PetscValidType(A,1);
1906   MatPreallocated(A);
1907   if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1908   if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1909 
1910   PetscValidHeaderSpecific(P,MAT_COOKIE,2);
1911   PetscValidType(P,2);
1912   MatPreallocated(P);
1913   if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1914   if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1915 
1916   PetscValidHeaderSpecific(C,MAT_COOKIE,3);
1917   PetscValidType(C,3);
1918   MatPreallocated(C);
1919   if (!C->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1920   if (C->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1921 
1922   if (P->N!=C->M) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->M);
1923   if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
1924   if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
1925   if (P->N!=C->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->N);
1926 
1927   /* For now, we do not dispatch based on the type of A and P */
1928   /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
1929   fA = A->ops->ptapnumeric;
1930   if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for A of type %s",A->type_name);
1931   fP = P->ops->ptapnumeric;
1932   if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for P of type %s",P->type_name);
1933   if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPNumeric requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);
1934 
1935   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1936   ierr = (*fA)(A,P,C);CHKERRQ(ierr);
1937   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
1938 
1939   PetscFunctionReturn(0);
1940 }
1941 
1942 #undef __FUNCT__
1943 #define __FUNCT__ "MatPtAPNumeric_SeqAIJ_SeqAIJ"
1944 PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ(Mat A,Mat P,Mat C)
1945 {
1946   PetscErrorCode ierr;
1947   PetscInt       flops=0;
1948   Mat_SeqAIJ     *a  = (Mat_SeqAIJ *) A->data;
1949   Mat_SeqAIJ     *p  = (Mat_SeqAIJ *) P->data;
1950   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
1951   PetscInt       *ai=a->i,*aj=a->j,*apj,*apjdense,*pi=p->i,*pj=p->j,*pJ=p->j,*pjj;
1952   PetscInt       *ci=c->i,*cj=c->j,*cjj;
1953   PetscInt       am=A->M,cn=C->N,cm=C->M;
1954   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
1955   MatScalar      *aa=a->a,*apa,*pa=p->a,*pA=p->a,*paj,*ca=c->a,*caj;
1956 
1957   PetscFunctionBegin;
1958   /* Allocate temporary array for storage of one row of A*P */
1959   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
1960   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
1961 
1962   apj      = (PetscInt *)(apa + cn);
1963   apjdense = apj + cn;
1964 
1965   /* Clear old values in C */
1966   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
1967 
1968   for (i=0;i<am;i++) {
1969     /* Form sparse row of A*P */
1970     anzi  = ai[i+1] - ai[i];
1971     apnzj = 0;
1972     for (j=0;j<anzi;j++) {
1973       prow = *aj++;
1974       pnzj = pi[prow+1] - pi[prow];
1975       pjj  = pj + pi[prow];
1976       paj  = pa + pi[prow];
1977       for (k=0;k<pnzj;k++) {
1978         if (!apjdense[pjj[k]]) {
1979           apjdense[pjj[k]] = -1;
1980           apj[apnzj++]     = pjj[k];
1981         }
1982         apa[pjj[k]] += (*aa)*paj[k];
1983       }
1984       flops += 2*pnzj;
1985       aa++;
1986     }
1987 
1988     /* Sort the j index array for quick sparse axpy. */
1989     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
1990 
1991     /* Compute P^T*A*P using outer product (P^T)[:,j]*(A*P)[j,:]. */
1992     pnzi = pi[i+1] - pi[i];
1993     for (j=0;j<pnzi;j++) {
1994       nextap = 0;
1995       crow   = *pJ++;
1996       cjj    = cj + ci[crow];
1997       caj    = ca + ci[crow];
1998       /* Perform sparse axpy operation.  Note cjj includes apj. */
1999       for (k=0;nextap<apnzj;k++) {
2000         if (cjj[k]==apj[nextap]) {
2001           caj[k] += (*pA)*apa[apj[nextap++]];
2002         }
2003       }
2004       flops += 2*apnzj;
2005       pA++;
2006     }
2007 
2008     /* Zero the current row info for A*P */
2009     for (j=0;j<apnzj;j++) {
2010       apa[apj[j]]      = 0.;
2011       apjdense[apj[j]] = 0;
2012     }
2013   }
2014 
2015   /* Assemble the final matrix and clean up */
2016   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2017   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2018   ierr = PetscFree(apa);CHKERRQ(ierr);
2019   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
2020 
2021   PetscFunctionReturn(0);
2022 }
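
/*
   Sketch of the sparse accumulator used by the numeric routine above (illustrative only): one
   row of A*P is accumulated into a dense value array apa[] indexed by column, apjdense[] flags
   the occupied columns, and apj[] records them so the accumulator can be cleared in O(nnz)
   rather than O(n) time.  The caller then sorts apj[] and scatters the row into C with the
   outer-product loop.  All names below are local to this example.
*/
#if 0
static void AccumulateRowOfAP(PetscInt anz,const PetscInt acol[],const MatScalar aval[],   /* row i of A */
                              const PetscInt pi[],const PetscInt pj[],const MatScalar pa[],/* CSR of P   */
                              MatScalar apa[],PetscInt apjdense[],PetscInt apj[],PetscInt *apnzj)
{
  PetscInt j,k,nz = 0;
  for (j=0; j<anz; j++) {
    PetscInt prow = acol[j];                 /* row of P selected by a(i,prow) */
    for (k=pi[prow]; k<pi[prow+1]; k++) {
      if (!apjdense[pj[k]]) { apjdense[pj[k]] = -1; apj[nz++] = pj[k]; }
      apa[pj[k]] += aval[j]*pa[k];           /* accumulate (A*P)[i,:] by column */
    }
  }
  *apnzj = nz;                               /* caller: sort apj, use the row, then zero apa/apjdense over apj */
}
#endif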
2023 
2024 /* Compute local C = P_loc^T * A * P - used by MatPtAP_MPIAIJ_MPIAIJ() */
2025 static PetscEvent logkey_matptapnumeric_local = 0;
2026 #undef __FUNCT__
2027 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_Local"
2028 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,Mat C)
2029 {
2030   PetscErrorCode ierr;
2031   PetscInt       flops=0;
2032   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
2033   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
2034   Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
2035   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
2036   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
2037   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
2038   PetscInt       *pJ=pj_loc,*pjj;
2039   PetscInt       *ci=c->i,*cj=c->j,*cjj;
2040   PetscInt       am=A->m,cn=C->N,cm=C->M;
2041   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
2042   MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*ca=c->a,*caj;
2043   MatScalar      *pa_loc=p_loc->a,*pA=pa_loc,*pa_oth=p_oth->a;
2044 
2045   PetscFunctionBegin;
2046   if (!logkey_matptapnumeric_local) {
2047     ierr = PetscLogEventRegister(&logkey_matptapnumeric_local,"MatPtAPNumeric_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
2048   }
2049   ierr = PetscLogEventBegin(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
2050 
2051   /* Allocate temporary array for storage of one row of A*P */
2052   ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
2053   ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
2054   apj      = (PetscInt *)(apa + cn);
2055   apjdense = apj + cn;
2056 
2057   /* Clear old values in C */
2058   ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);
2059 
2060   for (i=0;i<am;i++) {
2061     /* Form i-th sparse row of A*P */
2062     apnzj = 0;
2063     /* diagonal portion of A */
2064     anzi  = adi[i+1] - adi[i];
2065     for (j=0;j<anzi;j++) {
2066       prow = *adj;
2067       adj++;
2068       pnzj = pi_loc[prow+1] - pi_loc[prow];
2069       pjj  = pj_loc + pi_loc[prow];
2070       paj  = pa_loc + pi_loc[prow];
2071       for (k=0;k<pnzj;k++) {
2072         if (!apjdense[pjj[k]]) {
2073           apjdense[pjj[k]] = -1;
2074           apj[apnzj++]     = pjj[k];
2075         }
2076         apa[pjj[k]] += (*ada)*paj[k];
2077       }
2078       flops += 2*pnzj;
2079       ada++;
2080     }
2081     /* off-diagonal portion of A */
2082     anzi  = aoi[i+1] - aoi[i];
2083     for (j=0;j<anzi;j++) {
2084       col = a->garray[*aoj];
2085       if (col < cstart){
2086         prow = *aoj;
2087       } else if (col >= cend){
2088         prow = *aoj;
2089       } else {
2090         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
2091       }
2092       aoj++;
2093       pnzj = pi_oth[prow+1] - pi_oth[prow];
2094       pjj  = pj_oth + pi_oth[prow];
2095       paj  = pa_oth + pi_oth[prow];
2096       for (k=0;k<pnzj;k++) {
2097         if (!apjdense[pjj[k]]) {
2098           apjdense[pjj[k]] = -1;
2099           apj[apnzj++]     = pjj[k];
2100         }
2101         apa[pjj[k]] += (*aoa)*paj[k];
2102       }
2103       flops += 2*pnzj;
2104       aoa++;
2105     }
2106     /* Sort the j index array for quick sparse axpy. */
2107     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
2108 
2109     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
2110     pnzi = pi_loc[i+1] - pi_loc[i];
2111     for (j=0;j<pnzi;j++) {
2112       nextap = 0;
2113       crow   = *pJ++;
2114       cjj    = cj + ci[crow];
2115       caj    = ca + ci[crow];
2116       /* Perform sparse axpy operation.  Note cjj includes apj. */
2117       for (k=0;nextap<apnzj;k++) {
2118         if (cjj[k]==apj[nextap]) {
2119           caj[k] += (*pA)*apa[apj[nextap++]];
2120         }
2121       }
2122       flops += 2*apnzj;
2123       pA++;
2124     }
2125 
2126     /* Zero the current row info for A*P */
2127     for (j=0;j<apnzj;j++) {
2128       apa[apj[j]]      = 0.;
2129       apjdense[apj[j]] = 0;
2130     }
2131   }
2132 
2133   /* Assemble the final matrix and clean up */
2134   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2135   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2136   ierr = PetscFree(apa);CHKERRQ(ierr);
2137   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
2138   ierr = PetscLogEventEnd(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
2139   PetscFunctionReturn(0);
2140 }
2141 static PetscEvent logkey_matptapsymbolic_local = 0;
2142 #undef __FUNCT__
2143 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local"
2144 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,PetscReal fill,Mat *C)
2145 {
2146   PetscErrorCode ierr;
2147   FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
2148   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
2149   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
2150   PetscInt       *pti,*ptj,*ptJ,*ajj,*pjj;
2151   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
2152   PetscInt       *ci,*cj,*ptaj;
2153   PetscInt       aN=A->N,am=A->m,pN=P_loc->N;
2154   PetscInt       i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
2155   PetscInt       pm=P_loc->m,nlnk,*lnk;
2156   MatScalar      *ca;
2157   PetscBT        lnkbt;
2158   PetscInt       prend,nprow_loc,nprow_oth;
2159   PetscInt       *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
2160   Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
2161   PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
2162 
2163   PetscFunctionBegin;
2164   if (!logkey_matptapsymbolic_local) {
2165     ierr = PetscLogEventRegister(&logkey_matptapsymbolic_local,"MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
2166   }
2167   ierr = PetscLogEventBegin(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
2168 
2169   prend = prstart + pm;
2170 
2171   /* get ij structure of P_loc^T */
2172   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
2173   ptJ=ptj;
2174 
2175   /* Allocate ci array, arrays for fill computation and */
2176   /* free space for accumulating nonzero column info */
2177   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
2178   ci[0] = 0;
2179 
2180   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
2181   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
2182   ptasparserow_loc = ptadenserow_loc + aN;
2183   ptadenserow_oth  = ptasparserow_loc + aN;
2184   ptasparserow_oth = ptadenserow_oth + aN;
2185 
2186   /* create and initialize a linked list */
2187   nlnk = pN+1;
2188   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2189 
2190   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
2191   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
2192   nnz           = adi[am] + aoi[am];
2193   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
2194   current_space = free_space;
2195 
2196   /* determine symbolic info for each row of C: */
2197   for (i=0;i<pN;i++) {
2198     ptnzi  = pti[i+1] - pti[i];
2199     nprow_loc = 0; nprow_oth = 0;
2200     /* i-th row of symbolic P_loc^T*A_loc: */
2201     for (j=0;j<ptnzi;j++) {
2202       arow = *ptJ++;
2203       /* diagonal portion of A */
2204       anzj = adi[arow+1] - adi[arow];
2205       ajj  = adj + adi[arow];
2206       for (k=0;k<anzj;k++) {
2207         col = ajj[k]+prstart;
2208         if (!ptadenserow_loc[col]) {
2209           ptadenserow_loc[col]    = -1;
2210           ptasparserow_loc[nprow_loc++] = col;
2211         }
2212       }
2213       /* off-diagonal portion of A */
2214       anzj = aoi[arow+1] - aoi[arow];
2215       ajj  = aoj + aoi[arow];
2216       for (k=0;k<anzj;k++) {
2217         col = a->garray[ajj[k]];  /* global col */
2218         if (col < cstart){
2219           col = ajj[k];
2220         } else if (col >= cend){
2221           col = ajj[k] + pm;
2222         } else {
2223           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
2224         }
2225         if (!ptadenserow_oth[col]) {
2226           ptadenserow_oth[col]    = -1;
2227           ptasparserow_oth[nprow_oth++] = col;
2228         }
2229       }
2230     }
2231 
2232     /* using symbolic info of local PtA, determine symbolic info for row of C: */
2233     cnzi   = 0;
2234     /* rows of P_loc */
2235     ptaj = ptasparserow_loc;
2236     for (j=0; j<nprow_loc; j++) {
2237       prow = *ptaj++;
2238       prow -= prstart; /* rm */
2239       pnzj = pi_loc[prow+1] - pi_loc[prow];
2240       pjj  = pj_loc + pi_loc[prow];
2241       /* add non-zero cols of P into the sorted linked list lnk */
2242       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2243       cnzi += nlnk;
2244     }
2245     /* rows of P_oth */
2246     ptaj = ptasparserow_oth;
2247     for (j=0; j<nprow_oth; j++) {
2248       prow = *ptaj++;
2249       if (prow >= prend) prow -= pm; /* rm */
2250       pnzj = pi_oth[prow+1] - pi_oth[prow];
2251       pjj  = pj_oth + pi_oth[prow];
2252       /* add non-zero cols of P into the sorted linked list lnk */
2253       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
2254       cnzi += nlnk;
2255     }
2256 
2257     /* If free space is not available, make more free space */
2258     /* Double the amount of total space in the list */
2259     if (current_space->local_remaining<cnzi) {
2260       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
2261     }
2262 
2263     /* Copy data into free space, and zero out denserows */
2264     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
2265     current_space->array           += cnzi;
2266     current_space->local_used      += cnzi;
2267     current_space->local_remaining -= cnzi;
2268 
2269     for (j=0;j<nprow_loc; j++) {
2270       ptadenserow_loc[ptasparserow_loc[j]] = 0;
2271     }
2272     for (j=0;j<nprow_oth; j++) {
2273       ptadenserow_oth[ptasparserow_oth[j]] = 0;
2274     }
2275 
2276     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
2277     /*        For now, we will recompute what is needed. */
2278     ci[i+1] = ci[i] + cnzi;
2279   }
2280   /* nnz is now stored in ci[ptm], column indices are in the list of free space */
2281   /* Allocate space for cj, initialize cj, and */
2282   /* destroy list of free space and other temporary array(s) */
2283   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
2284   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
2285   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
2286   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2287 
2288   /* Allocate space for ca */
2289   ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
2290   ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
2291 
2292   /* put together the new matrix */
2293   ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,pN,pN,ci,cj,ca,C);CHKERRQ(ierr);
2294 
2295   /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
2296   /* Since these are PETSc arrays, change flags to free them as necessary. */
2297   c = (Mat_SeqAIJ *)((*C)->data);
2298   c->freedata = PETSC_TRUE;
2299   c->nonew    = 0;
2300 
2301   /* Clean up. */
2302   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
2303 
2304   ierr = PetscLogEventEnd(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
2305   PetscFunctionReturn(0);
2306 }
2307