xref: /petsc/src/mat/impls/aij/seq/matptap.c (revision 7c334f0243eb3f00fc22f2f6efb0b24b1906cf86)
/*
  Defines projective product routines where A is an AIJ matrix
          C = P^T * A * P
*/

#include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
#include "src/mat/utils/freespace.h"
#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

#undef __FUNCT__
#define __FUNCT__ "MatPtAP"
/*@
   MatPtAP - Creates the matrix projection C = P^T * A * P

   Collective on Mat

   Input Parameters:
+  A - the matrix
.  P - the projection matrix
.  scall - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
-  fill - expected fill as ratio of nnz(C)/(nnz(A) + nnz(P))

   Output Parameter:
.  C - the product matrix

   Notes:
   C will be created and must be destroyed by the user with MatDestroy().

   This routine is currently only implemented for pairs of SeqAIJ matrices and classes
   which inherit from SeqAIJ.  C will be of type MATSEQAIJ.

   Level: intermediate

.seealso: MatPtAPSymbolic(),MatPtAPNumeric(),MatMatMult()
@*/
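/*
   Example usage (a sketch, not taken from this file; assumes A and P are
   assembled AIJ matrices, e.g. in a Galerkin multigrid setup):

      Mat C;
      ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
      ...  change numerical values in A, keeping its nonzero pattern  ...
      ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);
      ierr = MatDestroy(C);CHKERRQ(ierr);
*/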
PetscErrorCode MatPtAP(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscErrorCode (*fA)(Mat,Mat,MatReuse,PetscReal,Mat *);
  PetscErrorCode (*fP)(Mat,Mat,MatReuse,PetscReal,Mat *);

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_COOKIE,1);
  PetscValidType(A,1);
  MatPreallocated(A);
  if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  PetscValidHeaderSpecific(P,MAT_COOKIE,2);
  PetscValidType(P,2);
  MatPreallocated(P);
  if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  PetscValidPointer(C,3);

  if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);

  if (fill <= 0.0) SETERRQ1(PETSC_ERR_ARG_SIZ,"fill=%g must be > 0.0",fill);

  /* For now, we do not dispatch based on the type of A and P */
  /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
  fA = A->ops->ptap;
  if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for A of type %s",A->type_name);
  fP = P->ops->ptap;
  if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAP not supported for P of type %s",P->type_name);
  if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAP requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);

  ierr = PetscLogEventBegin(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
  ierr = (*fA)(A,P,scall,fill,C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAP,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,PetscReal,Mat*);
EXTERN PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat,Mat,Mat,PetscInt,Mat);

EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
    if (ptap->abi) {ierr = PetscFree(ptap->abi);CHKERRQ(ierr);}
    if (ptap->abj) {ierr = PetscFree(ptap->abj);CHKERRQ(ierr);}

    ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
    /* free ptap only when the container was found; otherwise ptap is uninitialized */
    ierr = PetscFree(ptap);CHKERRQ(ierr);
  }

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
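
/*
   The query/compose calls above follow the standard PetscObjectContainer
   pattern for attaching a private struct to a Mat for later reuse.  A minimal
   sketch, with hypothetical names MyCtx and "MyKey":

      PetscObjectContainer cont;
      MyCtx                *ctx;
      ierr = PetscNew(MyCtx,&ctx);CHKERRQ(ierr);
      ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&cont);CHKERRQ(ierr);
      ierr = PetscObjectContainerSetPointer(cont,ctx);CHKERRQ(ierr);
      ierr = PetscObjectCompose((PetscObject)A,"MyKey",(PetscObject)cont);CHKERRQ(ierr);
      ...
      ierr = PetscObjectQuery((PetscObject)A,"MyKey",(PetscObject*)&cont);CHKERRQ(ierr);
      if (cont) {ierr = PetscObjectContainerGetPointer(cont,(void**)&ctx);CHKERRQ(ierr);}
*/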

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
    ptap->abi=PETSC_NULL; ptap->abj=PETSC_NULL;
    ptap->abnz_max = 0; /* symbolic A*P is not done yet */

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);

    /* attach the supporting struct to P for reuse */
    P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
    ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
    ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);

    /* now, compute symbolic local P^T*A*P */
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  } else if (scall == MAT_REUSE_MATRIX){
    ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
    if (container) {
      ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    } else {
      SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
    }

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);

  } else {
    SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
  }
  /* now, compute numeric local P^T*A*P */
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat                  P_loc,P_oth;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;
  FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ           *p_loc,*p_oth;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
  PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
  PetscInt             aN=A->N,am=A->m,pN=P->N;
  PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
  PetscBT              lnkbt;
  PetscInt             prstart,prend,nprow_loc,nprow_oth;
  PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;

  MPI_Comm             comm=A->comm;
  Mat                  B_mpi;
  PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
  PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt             len,proc,*dnz,*onz,*owners;
  PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
  PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
  MPI_Status           *status;
  Mat_Merge_SeqsToMPI  *merge;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }
  /* compute symbolic C_seq = P_loc^T * A_loc * P_seq */
  /*--------------------------------------------------*/
  /* get data from P_loc and P_oth */
  P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
  p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
  prend = prstart + pm;

  /* get ij structure of P_loc^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
  ptJ=ptj;

  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow_loc = ptadenserow_loc + aN;
  ptadenserow_oth  = ptasparserow_loc + aN;
  ptasparserow_oth = ptadenserow_oth + aN;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
  /* This should be reasonable if sparsity of PtAP is similar to that of A. */
  nnz           = adi[am] + aoi[am];
  ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic info for each row of C: */
  for (i=0;i<pN;i++) {
    ptnzi  = pti[i+1] - pti[i];
    nprow_loc = 0; nprow_oth = 0;
    /* i-th row of symbolic P_loc^T*A_loc: */
    for (j=0;j<ptnzi;j++) {
      arow = *ptJ++;
      /* diagonal portion of A */
      anzj = adi[arow+1] - adi[arow];
      ajj  = adj + adi[arow];
      for (k=0;k<anzj;k++) {
        col = ajj[k]+prstart;
        if (!ptadenserow_loc[col]) {
          ptadenserow_loc[col]    = -1;
          ptasparserow_loc[nprow_loc++] = col;
        }
      }
      /* off-diagonal portion of A */
      anzj = aoi[arow+1] - aoi[arow];
      ajj  = aoj + aoi[arow];
      for (k=0;k<anzj;k++) {
        col = a->garray[ajj[k]];  /* global col */
        if (col < cstart){
          col = ajj[k];
        } else if (col >= cend){
          col = ajj[k] + pm;
        } else {
          SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
        }
        if (!ptadenserow_oth[col]) {
          ptadenserow_oth[col]    = -1;
          ptasparserow_oth[nprow_oth++] = col;
        }
      }
    }

    /* using symbolic info of local PtA, determine symbolic info for row of C: */
    cnzi = 0;
    /* rows of P_loc */
    ptaj = ptasparserow_loc;
    for (j=0; j<nprow_loc; j++) {
      prow = *ptaj++;
      prow -= prstart; /* remove the row offset added above */
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }
    /* rows of P_oth */
    ptaj = ptasparserow_oth;
    for (j=0; j<nprow_oth; j++) {
      prow = *ptaj++;
      if (prow >= prend) prow -= pm; /* remove the row offset added above */
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }

    /* If free space is not available, make more free space */
    /* Double the amount of total space in the list */
    if (current_space->local_remaining<cnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, and zero out denserows */
    ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += cnzi;
    current_space->local_used      += cnzi;
    current_space->local_remaining -= cnzi;

    for (j=0;j<nprow_loc; j++) {
      ptadenserow_loc[ptasparserow_loc[j]] = 0;
    }
    for (j=0;j<nprow_oth; j++) {
      ptadenserow_oth[ptasparserow_oth[j]] = 0;
    }

    /* Aside: Perhaps we should save the pta info for the numerical factorization. */
    /*        For now, we will recompute what is needed. */
    ci[i+1] = ci[i] + cnzi;
  }
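
  /*
     The loop above combines two standard techniques: a dense-flag/sparse-list
     pair (ptadenserow_* marks rows of P already seen, ptasparserow_* records
     only the marked entries so the reset costs O(nnz), not O(N)), and the
     PetscLLCreate/PetscLLAdd/PetscLLClean sorted linked list that merges the
     column sets.  A self-contained sketch of the flag/list idea on plain CSR
     arrays (ai,aj,dense,sparse are illustrative names, not variables above):

        nnz_row = 0;
        for (j=ai[i]; j<ai[i+1]; j++) {
          col = aj[j];
          if (!dense[col]) {             .. first occurrence in this row ..
            dense[col]        = -1;
            sparse[nnz_row++] = col;
          }
        }
        for (j=0; j<nnz_row; j++) dense[sparse[j]] = 0;   .. cheap reset ..
  */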
  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);

  /* nnz is now stored in ci[pN], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Allocate space for ca */
  /*
  ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
  ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);
  */

  /* add C_seq into mpi C              */
  /*-----------------------------------*/
  free_space=PETSC_NULL; current_space=PETSC_NULL;

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);

  /* determine row ownership */
  /*---------------------------------------------------------*/
  ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
  ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* determine the number of messages to send, their lengths */
  /*---------------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  len   = 0;  /* length of buf_si[] */
  merge->nsend = 0;
  for (proc=0; proc<size; proc++){
    len_si[proc] = 0;
    if (proc == rank){
      len_s[proc] = 0;
    } else {
      len_si[proc] = owners[proc+1] - owners[proc] + 1;
      len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of nonzeros to be sent to [proc] */
    }
    if (len_s[proc]) {
      merge->nsend++;
      nrows = 0;
      for (i=owners[proc]; i<owners[proc+1]; i++){
        if (ci[i+1] > ci[i]) nrows++;
      }
      len_si[proc] = 2*(nrows+1);
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for ij-structure */
  /*-------------------------------------------------------------------------*/
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv of j-structure */
  /*-------------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);

  /* post the Isend of j-structure */
  /*--------------------------------*/
  ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
  sj_waits = si_waits + merge->nsend;

  for (proc=0, k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of j-structure are complete */
  /*------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);

  /* send and recv i-structure */
  /*---------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows = 0;
    for (i=owners[proc]; i<owners[proc+1]; i++){
      anzi = ci[i+1] - ci[i];
      if (anzi) {
        buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */
        buf_si[nrows+1] = i-owners[proc]; /* local row index */
        nrows++;
      }
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
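
  /*
     Worked example of the i-structure message format documented above: if
     local rows 2 and 5 are the only nonempty rows sent to [proc], with 3 and
     4 nonzeros respectively, then

        buf_si[0]    = 2            (nrows)
        buf_si[1..2] = {2, 5}       (local row indices)
        buf_si[3..5] = {0, 3, 7}    (running offsets into the j-structure message)

     and len_si[proc] = 2*(nrows+1) = 6.  (Illustrative numbers only.)
  */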

  ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);

  ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++){
    ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(rj_waits);CHKERRQ(ierr);
  ierr = PetscFree(si_waits);CHKERRQ(ierr);
  ierr = PetscFree(ri_waits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  /* compute a local seq matrix in each processor */
  /*----------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
  len  = ci[owners[rank+1]] - ci[owners[rank]];
  ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic info for each local row */
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow = buf_ri_k + merge->nrecv;
  nextci  = nextrow + merge->nrecv;
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  len = 0;
  for (i=0; i<pn; i++) {
    bnzi   = 0;
    /* add local non-zero cols of this proc's C_seq into lnk */
    arow   = owners[rank] + i;
    anzi   = ci[arow+1] - ci[arow];
    cji    = cj + ci[arow];
    ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
    bnzi += nlnk;
    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        anzi = *(nextci[k]+1) - *nextci[k];
        cji  = buf_rj[k] + *nextci[k];
        ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
        bnzi += nlnk;
        nextrow[k]++; nextci[k]++;
      }
    }
    if (len < bnzi) len = bnzi;  /* len = max(bnzi) */

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<bnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += bnzi;
    current_space->local_used      += bnzi;
    current_space->local_remaining -= bnzi;

    bi[i+1] = bi[i] + bnzi;
  }

  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* create symbolic parallel matrix B_mpi */
  /*---------------------------------------*/
  ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
  ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  /* ierr = MatSetOption(B_mpi,MAT_COLUMNS_SORTED);CHKERRQ(ierr); -cause delay? */

  /* B_mpi is not ready for use - assembly will be done by MatMerge_SeqsToMPINumeric() */
  B_mpi->assembled     = PETSC_FALSE;
  B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
  merge->bi            = bi;
  merge->bj            = bj;
  merge->ci            = ci;
  merge->cj            = cj;
  merge->buf_ri        = buf_ri;
  merge->buf_rj        = buf_rj;
  merge->C_seq         = PETSC_NULL;

  /* attach the supporting struct to B_mpi for reuse */
  ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
  ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
  *C = B_mpi;

  PetscFunctionReturn(0);
}
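
/*
   The ownership-driven send lengths above can be checked by hand.  Suppose
   size=2, owners = {0,3,6}, and this process holds C_seq rows with
   ci = {0,2,5,9,9,12,15}.  Rows 3..5 belong to process 1, so
   len_s[1] = ci[6]-ci[3] = 15-9 = 6 column indices are sent, and since row 3
   is empty (ci[4]-ci[3] = 0) only nrows = 2 rows appear in the i-structure,
   giving len_si[1] = 2*(nrows+1) = 6.  (Illustrative numbers only.)
*/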

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode       ierr;
  Mat_Merge_SeqsToMPI  *merge;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer cont_merge,cont_ptap;

  PetscInt       flops=0;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
  PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  PetscInt       *cjj;
  MatScalar      *ada=ad->a,*aoa=ao->a,*ada_tmp=ad->a,*aoa_tmp=ao->a,*apa_tmp,*paj,*cseqa,*caj;
  MatScalar      *pa_loc,*pA,*pa_oth;
  PetscInt       am=A->m,cN=C->N;
  PetscInt       nextp,*adj_tmp=ad->j,*aoj_tmp=ao->j;

  MPI_Comm             comm=C->comm;
  PetscMPIInt          size,rank,taga,*len_s;
  PetscInt             *owners;
  PetscInt             proc;
  PetscInt             **buf_ri,**buf_rj;
  PetscInt             cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
  PetscInt             nrows,**buf_ri_k,**nextrow,**nextcseqi;
  MPI_Request          *s_waits,*r_waits;
  MPI_Status           *status;
  MatScalar            **abuf_r,*ba_i;
  PetscInt             *cseqi,*cseqj;
  PetscInt             *cseqj_tmp;
  MatScalar            *cseqa_tmp;
  PetscInt             stages[2];
  PetscInt             *apsymi,*apsymj;
  PetscTruth           HasSymAP;
  FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;

  PetscFunctionBegin;
  ierr = PetscLogStageRegister(&stages[0],"NumPtAP_local");CHKERRQ(ierr);
  ierr = PetscLogStageRegister(&stages[1],"NumPtAP_Comm");CHKERRQ(ierr);

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
  if (cont_merge) {
    ierr = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix C does not possess an object container");
  }
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
  if (cont_ptap) {
    ierr = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }
  /* set or get data from symbolic A*P */
  if (!ptap->abnz_max) {
    HasSymAP = PETSC_FALSE;
    ierr = PetscMalloc((am+2)*sizeof(PetscInt),&apsymi);CHKERRQ(ierr);
    ptap->abi = apsymi;
    apsymi[0] = 0;
    ierr = PetscMalloc(cN*sizeof(MatScalar),&apa_tmp);CHKERRQ(ierr);
    ierr = PetscMemzero(apa_tmp,cN*sizeof(MatScalar));CHKERRQ(ierr);
    /* Initial FreeSpace size is 2*nnz(A) */
    ierr = GetMoreSpace((PetscInt)(2*(adi[am]+aoi[am])),&free_space);CHKERRQ(ierr);
    current_space = free_space;
  } else {
    HasSymAP = PETSC_TRUE;
    ierr = PetscMalloc((ptap->abnz_max+1)*sizeof(MatScalar),&apa_tmp);CHKERRQ(ierr);
    ierr = PetscMemzero(apa_tmp,ptap->abnz_max*sizeof(MatScalar));CHKERRQ(ierr);
    apsymi = ptap->abi; apsymj = ptap->abj;
  }

  /* get data from symbolic products */
  p_loc=(Mat_SeqAIJ*)(ptap->B_loc)->data;
  p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  cseqi = merge->ci; cseqj=merge->cj;
  ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
  ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);

  /* Allocate temporary arrays for the column indices and flags of one row of A*P */
  ierr = PetscMalloc(cN*(2*sizeof(PetscInt)),&apj);CHKERRQ(ierr);
  ierr = PetscMemzero(apj,cN*(2*sizeof(PetscInt)));CHKERRQ(ierr);
  apjdense = apj + cN;

  ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
  for (i=0;i<am;i++) {
    /* Form i-th sparse row of A*P */
    if (!HasSymAP){ /* Compute symbolic i-th sparse row of A*P */
      apnzj = 0;
      /* diagonal portion of A */
      anzi  = adi[i+1] - adi[i];
      for (j=0;j<anzi;j++) {
        prow = *adj;
        adj++;
        pnzj = pi_loc[prow+1] - pi_loc[prow];
        pjj  = pj_loc + pi_loc[prow];
        paj  = pa_loc + pi_loc[prow];
        for (k=0;k<pnzj;k++) {
          if (!apjdense[pjj[k]]) {
            apjdense[pjj[k]] = -1;
            apj[apnzj++]     = pjj[k];
          }
        }
        flops += 2*pnzj;
        ada++;
      }
      /* off-diagonal portion of A */
      anzi  = aoi[i+1] - aoi[i];
      for (j=0;j<anzi;j++) {
        col = a->garray[*aoj];
        if (col < cstart){
          prow = *aoj;
        } else if (col >= cend){
          prow = *aoj;
        } else {
          SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
        }
        aoj++;
        pnzj = pi_oth[prow+1] - pi_oth[prow];
        pjj  = pj_oth + pi_oth[prow];
        paj  = pa_oth + pi_oth[prow];
        for (k=0;k<pnzj;k++) {
          if (!apjdense[pjj[k]]) {
            apjdense[pjj[k]] = -1;
            apj[apnzj++]     = pjj[k];
          }
        }
        flops += 2*pnzj;
        aoa++;
      }
      /* Sort the j index array for quick sparse axpy. */
      ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
    } else { /* get symbolic AP[i,:] */
      apnzj = apsymi[i+1] - apsymi[i];
      for (j=0; j<apnzj; j++){
        apj[j] = *(apsymj + apsymi[i]+j);
      }
    }
    if (!HasSymAP){
      apsymi[i+1] = apsymi[i] + apnzj;
      if (ptap->abnz_max < apnzj) ptap->abnz_max = apnzj;

      /* If free space is not available, make more free space */
      /* Double the amount of total space in the list */
      if (current_space->local_remaining<apnzj) {
        ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      }

      /* Copy the column indices of this row into the free space */
      for (j=0; j<apnzj; j++) current_space->array[j] = apj[j];
      current_space->array           += apnzj;
      current_space->local_used      += apnzj;
      current_space->local_remaining -= apnzj;
    }

    /* use symbolic AP[i,:] to form a condensed array apa_tmp */
    /* diagonal portion of A */
    anzi  = adi[i+1] - adi[i];
    for (j=0;j<anzi;j++) {
      prow = *adj_tmp;
      adj_tmp++;
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      paj  = pa_loc + pi_loc[prow];
      nextp = 0;
      for (k=0; nextp<pnzj; k++) {
        if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
          apa_tmp[k] += (*ada_tmp)*paj[nextp++];
        }
      }
      ada_tmp++;
    }
    /* off-diagonal portion of A */
    anzi  = aoi[i+1] - aoi[i];
    for (j=0;j<anzi;j++) {
      col = a->garray[*aoj_tmp];
      if (col < cstart){
        prow = *aoj_tmp;
      } else if (col >= cend){
        prow = *aoj_tmp;
      } else {
        SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
      }
      aoj_tmp++;
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      paj  = pa_oth + pi_oth[prow];
      nextp = 0;
      for (k=0; nextp<pnzj; k++) {
        if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
          apa_tmp[k] += (*aoa_tmp)*paj[nextp++];
        }
      }
      flops += 2*pnzj;
      aoa_tmp++;
    }

    /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
    pnzi = pi_loc[i+1] - pi_loc[i];
    for (j=0;j<pnzi;j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cseqj + cseqi[crow];
      caj    = cseqa + cseqi[crow];
      /* Perform sparse axpy operation.  Note cjj includes apj. */
      for (k=0;nextap<apnzj;k++) {
        if (cjj[k]==apj[nextap]) {
          caj[k] += (*pA)*apa_tmp[nextap++];
        }
      }

      flops += 2*apnzj;
      pA++;
    }

    /* Zero the current row info for A*P */
    for (j=0;j<apnzj;j++) {
      apjdense[apj[j]] = 0;
      apa_tmp[j] = 0;
    }
  }
  if (!HasSymAP){
    /* Column indices of AP are in the list of free space */
    /* Allocate space for apsymj, initialize apsymj, and */
    /* destroy list of free space and other temporary array(s) */
    ierr = PetscMalloc((apsymi[am]+1)*sizeof(PetscInt),&ptap->abj);CHKERRQ(ierr);
    ierr = MakeSpaceContiguous(&free_space,ptap->abj);CHKERRQ(ierr);
  }
  ierr = PetscFree(apa_tmp);CHKERRQ(ierr);
  ierr = PetscFree(apj);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);

  bi     = merge->bi;
  bj     = merge->bj;
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;

  ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr   = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* send and recv matrix values */
  /*-----------------------------*/
  ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
  len_s = merge->len_s;
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
  ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }

  ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  ierr = PetscFree(s_waits);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);

  /* insert mat values of mpimat */
  /*-----------------------------*/
  ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr);
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow   = buf_ri_k + merge->nrecv;
  nextcseqi = nextrow + merge->nrecv;

  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows = *(buf_ri_k[k]);
    nextrow[k]   = buf_ri_k[k]+1;  /* next row number of k-th received i-structure */
    nextcseqi[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  /* set values of ba */
  for (i=0; i<C->m; i++) {
    cseqrow = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj+bi[i];  /* col indices of the i-th row of C */
    bnzi = bi[i+1] - bi[i];
    ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);

    /* add local non-zero vals of this proc's C_seq into ba */
    cseqnzi = cseqi[cseqrow+1] - cseqi[cseqrow];
    cseqj_tmp = cseqj + cseqi[cseqrow];
    cseqa_tmp = cseqa + cseqi[cseqrow];
    nextcseqj = 0;
    for (j=0; nextcseqj<cseqnzi; j++){
      if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
        ba_i[j] += cseqa_tmp[nextcseqj++];
      }
    }

    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cseqnzi   = *(nextcseqi[k]+1) - *nextcseqi[k];
        cseqj_tmp = buf_rj[k] + *(nextcseqi[k]);
        cseqa_tmp = abuf_r[k] + *(nextcseqi[k]);
        nextcseqj = 0;
        for (j=0; nextcseqj<cseqnzi; j++){
          if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
            ba_i[j] += cseqa_tmp[nextcseqj++];
          }
        }
        nextrow[k]++; nextcseqi[k]++;
      }
    }
    ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
    flops += 2*bnzi;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);

  ierr = PetscFree(cseqa);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree(ba_i);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
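
/*
   Note the two storage schemes for a row of A*P used in this file: when the
   symbolic AP is computed on the fly above, apa_tmp is condensed (apa_tmp[k]
   pairs with column apj[k]), whereas the sequential routine further below
   keeps a dense apa indexed directly by column number.  Tiny example: if
   apj = {1,4,7}, the condensed values {x,y,z} stand for the dense row
   {0,x,0,0,y,0,0,z,...}.
*/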

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_SeqAIJ_SeqAIJ"
PetscErrorCode MatPtAP_SeqAIJ_SeqAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    ierr = MatPtAPSymbolic_SeqAIJ_SeqAIJ(A,P,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = MatPtAPNumeric_SeqAIJ_SeqAIJ(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatPtAPSymbolic - Creates the (i,j) structure of the matrix projection C = P^T * A * P

   Collective on Mat

   Input Parameters:
+  A - the matrix
-  P - the projection matrix

   Output Parameter:
.  C - the (i,j) structure of the product matrix

   Notes:
   C will be created and must be destroyed by the user with MatDestroy().

   This routine is currently only implemented for pairs of SeqAIJ matrices and classes
   which inherit from SeqAIJ.  C will be of type MATSEQAIJ.  The product is computed using
   this (i,j) structure by calling MatPtAPNumeric().

   Level: intermediate

.seealso: MatPtAP(),MatPtAPNumeric(),MatMatMultSymbolic()
*/
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic"
PetscErrorCode MatPtAPSymbolic(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscErrorCode (*fA)(Mat,Mat,PetscReal,Mat*);
  PetscErrorCode (*fP)(Mat,Mat,PetscReal,Mat*);

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_COOKIE,1);
  PetscValidType(A,1);
  MatPreallocated(A);
  if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidHeaderSpecific(P,MAT_COOKIE,2);
  PetscValidType(P,2);
  MatPreallocated(P);
  if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidPointer(C,3);

  if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
  if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);

  /* For now, we do not dispatch based on the type of A and P */
  /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
  fA = A->ops->ptapsymbolic;
  if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for A of type %s",A->type_name);
  fP = P->ops->ptapsymbolic;
  if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPSymbolic not supported for P of type %s",P->type_name);
  if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPSymbolic requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);

  ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  ierr = (*fA)(A,P,fill,C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
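
/*
   Example usage (a sketch, not from this file): amortize one symbolic phase
   over many numeric products when only the values of A change; nsteps is a
   hypothetical count.

      Mat C;
      ierr = MatPtAPSymbolic(A,P,1.0,&C);CHKERRQ(ierr);
      for (k=0; k<nsteps; k++) {
        ...  update entries of A  ...
        ierr = MatPtAPNumeric(A,P,C);CHKERRQ(ierr);
      }
      ierr = MatDestroy(C);CHKERRQ(ierr);
*/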
#ifdef TOBEREMOVE
typedef struct {
  Mat symAP;
} Mat_PtAPstruct;

EXTERN PetscErrorCode MatDestroy_SeqAIJ(Mat);

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_SeqAIJ_PtAP"
PetscErrorCode MatDestroy_SeqAIJ_PtAP(Mat A)
{
  PetscErrorCode ierr;
  Mat_PtAPstruct *ptap=(Mat_PtAPstruct*)A->spptr;

  PetscFunctionBegin;
  ierr = MatDestroy(ptap->symAP);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif /* TOBEREMOVE */

#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqAIJ"
PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*p = (Mat_SeqAIJ*)P->data,*c;
  PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
  PetscInt       *ci,*cj,*ptadenserow,*ptasparserow,*ptaj;
  PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M;
  PetscInt       i,j,k,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi,nlnk,*lnk;
  MatScalar      *ca;
  PetscBT        lnkbt;

  PetscFunctionBegin;
  /* Get ij structure of P^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);
  ptJ=ptj;

  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow,(2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow = ptadenserow + an;

  /* create and initialize a linked list */
  nlnk = pn+1;
  ierr = PetscLLCreate(pn,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
  /* This should be reasonable if sparsity of PtAP is similar to that of A. */
  ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* Determine symbolic info for each row of C: */
  for (i=0;i<pn;i++) {
    ptnzi  = pti[i+1] - pti[i];
    ptanzi = 0;
    /* Determine symbolic row of PtA: */
    for (j=0;j<ptnzi;j++) {
      arow = *ptJ++;
      anzj = ai[arow+1] - ai[arow];
      ajj  = aj + ai[arow];
      for (k=0;k<anzj;k++) {
        if (!ptadenserow[ajj[k]]) {
          ptadenserow[ajj[k]]    = -1;
          ptasparserow[ptanzi++] = ajj[k];
        }
      }
    }
    /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
    ptaj = ptasparserow;
    cnzi = 0;
    for (j=0;j<ptanzi;j++) {
      prow = *ptaj++;
      pnzj = pi[prow+1] - pi[prow];
      pjj  = pj + pi[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pn,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }

    /* If free space is not available, make more free space */
    /* Double the amount of total space in the list */
    if (current_space->local_remaining<cnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, and zero out denserows */
    ierr = PetscLLClean(pn,pn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += cnzi;
    current_space->local_used      += cnzi;
    current_space->local_remaining -= cnzi;

    for (j=0;j<ptanzi;j++) {
      ptadenserow[ptasparserow[j]] = 0;
    }
    /* Aside: Perhaps we should save the pta info for the numerical factorization. */
    /*        For now, we will recompute what is needed. */
    ci[i+1] = ci[i] + cnzi;
  }
  /* nnz is now stored in ci[pn], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Allocate space for ca */
  ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
  ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);

  /* put together the new matrix */
  ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);

  /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
  /* Since these are PETSc arrays, change flags to free them as necessary. */
  c = (Mat_SeqAIJ *)((*C)->data);
  c->freedata = PETSC_TRUE;
  c->nonew    = 0;

  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
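
/*
   Worked example of the initial free-space heuristic above: with
   nnz(A) = ai[am] = 1000, pm = 100 rows and pn = 25 columns in P, the first
   allocation is (1000/100)*25 = 250 entries, i.e. nnz(A) scaled by the
   aspect ratio pn/pm of P; GetMoreSpace() then doubles the total whenever a
   row of C does not fit.  (Illustrative numbers only.)
*/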

#include "src/mat/impls/maij/maij.h"
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_SeqAIJ_SeqMAIJ"
PetscErrorCode MatPtAPSymbolic_SeqAIJ_SeqMAIJ(Mat A,Mat PP,Mat *C)
{
  /* This routine requires testing -- I don't think it works. */
  PetscErrorCode ierr;
  FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_SeqMAIJ    *pp=(Mat_SeqMAIJ*)PP->data;
  Mat            P=pp->AIJ;
  Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*p=(Mat_SeqAIJ*)P->data,*c;
  PetscInt       *pti,*ptj,*ptJ,*ai=a->i,*aj=a->j,*ajj,*pi=p->i,*pj=p->j,*pjj;
  PetscInt       *ci,*cj,*denserow,*sparserow,*ptadenserow,*ptasparserow,*ptaj;
  PetscInt       an=A->N,am=A->M,pn=P->N,pm=P->M,ppdof=pp->dof;
  PetscInt       i,j,k,dof,pdof,ptnzi,arow,anzj,ptanzi,prow,pnzj,cnzi;
  MatScalar      *ca;

  PetscFunctionBegin;
  /* Start timer */
  ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);

  /* Get ij structure of P^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);

  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((2*pn+2*an+1)*sizeof(PetscInt),&ptadenserow);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow,(2*pn+2*an+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow = ptadenserow  + an;
  denserow     = ptasparserow + an;
  sparserow    = denserow     + pn;

  /* Set initial free space to be nnz(A) scaled by aspect ratio of P. */
  /* This should be reasonable if sparsity of PtAP is similar to that of A. */
  ierr          = GetMoreSpace((ai[am]/pm)*pn,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* Determine symbolic info for each row of C: */
  for (i=0;i<pn/ppdof;i++) {
    ptnzi  = pti[i+1] - pti[i];
    ptanzi = 0;
    ptJ    = ptj + pti[i];
    for (dof=0;dof<ppdof;dof++) {
      /* Determine symbolic row of PtA: */
      for (j=0;j<ptnzi;j++) {
        arow = ptJ[j] + dof;
        anzj = ai[arow+1] - ai[arow];
        ajj  = aj + ai[arow];
        for (k=0;k<anzj;k++) {
          if (!ptadenserow[ajj[k]]) {
            ptadenserow[ajj[k]]    = -1;
            ptasparserow[ptanzi++] = ajj[k];
          }
        }
      }
      /* Using symbolic info for row of PtA, determine symbolic info for row of C: */
      ptaj = ptasparserow;
      cnzi = 0;
      for (j=0;j<ptanzi;j++) {
        pdof = *ptaj%ppdof;     /* component within the dof block; dividing by the
                                   loop counter dof, as the code previously did,
                                   is undefined for dof==0 */
        prow = (*ptaj++)/ppdof; /* row of the underlying AIJ matrix of P */
        pnzj = pi[prow+1] - pi[prow];
        pjj  = pj + pi[prow];
        for (k=0;k<pnzj;k++) {
          if (!denserow[pjj[k]+pdof]) {
            denserow[pjj[k]+pdof] = -1;
            sparserow[cnzi++]     = pjj[k]+pdof;
          }
        }
      }

      /* sort sparserow */
      ierr = PetscSortInt(cnzi,sparserow);CHKERRQ(ierr);

      /* If free space is not available, make more free space */
      /* Double the amount of total space in the list */
      if (current_space->local_remaining<cnzi) {
        ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      }

      /* Copy data into free space, and zero out denserows */
      ierr = PetscMemcpy(current_space->array,sparserow,cnzi*sizeof(PetscInt));CHKERRQ(ierr);
      current_space->array           += cnzi;
      current_space->local_used      += cnzi;
      current_space->local_remaining -= cnzi;

      for (j=0;j<ptanzi;j++) {
        ptadenserow[ptasparserow[j]] = 0;
      }
      for (j=0;j<cnzi;j++) {
        denserow[sparserow[j]] = 0;
      }
      /* Aside: Perhaps we should save the pta info for the numerical factorization. */
      /*        For now, we will recompute what is needed. */
      ci[i+1+dof] = ci[i+dof] + cnzi;
    }
  }
  /* nnz is now stored in ci[pn], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pn]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow);CHKERRQ(ierr);

  /* Allocate space for ca */
  ierr = PetscMalloc((ci[pn]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
  ierr = PetscMemzero(ca,(ci[pn]+1)*sizeof(MatScalar));CHKERRQ(ierr);

  /* put together the new matrix */
  ierr = MatCreateSeqAIJWithArrays(A->comm,pn,pn,ci,cj,ca,C);CHKERRQ(ierr);

  /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
  /* Since these are PETSc arrays, change flags to free them as necessary. */
  c = (Mat_SeqAIJ *)((*C)->data);
  c->freedata = PETSC_TRUE;
  c->nonew    = 0;

  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P,&pti,&ptj);CHKERRQ(ierr);

  ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,PP,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
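
/*
   Background for the MAIJ variant above (a sketch of the intended index map,
   consistent with the dof loop; the routine itself is still flagged as
   untested): a SeqMAIJ matrix with dof components interleaves unknowns, so
   row r of the expanded P corresponds to AIJ row r/dof with component r%dof.
   For example, with dof = 3 the expanded rows {6,7,8} all come from AIJ row 2
   with components {0,1,2}.
*/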

/*
   MatPtAPNumeric - Computes the matrix projection C = P^T * A * P

   Collective on Mat

   Input Parameters:
+  A - the matrix
-  P - the projection matrix

   Output Parameter:
.  C - the product matrix

   Notes:
   C must have been created by calling MatPtAPSymbolic() and must be destroyed by
   the user with MatDestroy().

   This routine is currently only implemented for pairs of AIJ matrices and classes
   which inherit from AIJ.  C will be of type MATAIJ.

   Level: intermediate

.seealso: MatPtAP(),MatPtAPSymbolic(),MatMatMultNumeric()
*/
#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric"
PetscErrorCode MatPtAPNumeric(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  PetscErrorCode (*fA)(Mat,Mat,Mat);
  PetscErrorCode (*fP)(Mat,Mat,Mat);

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_COOKIE,1);
  PetscValidType(A,1);
  MatPreallocated(A);
  if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (A->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidHeaderSpecific(P,MAT_COOKIE,2);
  PetscValidType(P,2);
  MatPreallocated(P);
  if (!P->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (P->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  PetscValidHeaderSpecific(C,MAT_COOKIE,3);
  PetscValidType(C,3);
  MatPreallocated(C);
  if (!C->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  if (C->factor) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");

  if (P->N!=C->M) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->M);
  if (P->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->M,A->N);
  if (A->M!=A->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix 'A' must be square, %D != %D",A->M,A->N);
  if (P->N!=C->N) SETERRQ2(PETSC_ERR_ARG_SIZ,"Matrix dimensions are incompatible, %D != %D",P->N,C->N);

  /* For now, we do not dispatch based on the type of A and P */
  /* When implementations like _SeqAIJ_MAIJ exist, attack the multiple dispatch problem. */
  fA = A->ops->ptapnumeric;
  if (!fA) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for A of type %s",A->type_name);
  fP = P->ops->ptapnumeric;
  if (!fP) SETERRQ1(PETSC_ERR_SUP,"MatPtAPNumeric not supported for P of type %s",P->type_name);
  if (fP!=fA) SETERRQ2(PETSC_ERR_ARG_INCOMP,"MatPtAPNumeric requires A, %s, to be compatible with P, %s",A->type_name,P->type_name);

  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = (*fA)(A,P,C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_SeqAIJ_SeqAIJ"
PetscErrorCode MatPtAPNumeric_SeqAIJ_SeqAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  PetscInt       flops=0;
  Mat_SeqAIJ     *a  = (Mat_SeqAIJ *) A->data;
  Mat_SeqAIJ     *p  = (Mat_SeqAIJ *) P->data;
  Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
  PetscInt       *ai=a->i,*aj=a->j,*apj,*apjdense,*pi=p->i,*pj=p->j,*pJ=p->j,*pjj;
  PetscInt       *ci=c->i,*cj=c->j,*cjj;
  PetscInt       am=A->M,cn=C->N,cm=C->M;
  PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  MatScalar      *aa=a->a,*apa,*pa=p->a,*pA=p->a,*paj,*ca=c->a,*caj;

  PetscFunctionBegin;
  /* Allocate temporary array for storage of one row of A*P */
  ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);

  apj      = (PetscInt *)(apa + cn);
  apjdense = apj + cn;
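  /* apa[col] holds the current value of (A*P)[i,col]; apjdense[col] flags the
     occupied columns; apj lists those columns so they can be sorted and reset cheaply. */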

  /* Clear old values in C */
  ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0;i<am;i++) {
    /* Form sparse row of A*P */
    anzi  = ai[i+1] - ai[i];
    apnzj = 0;
    for (j=0;j<anzi;j++) {
      prow = *aj++;
      pnzj = pi[prow+1] - pi[prow];
      pjj  = pj + pi[prow];
      paj  = pa + pi[prow];
      for (k=0;k<pnzj;k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*aa)*paj[k];
      }
      flops += 2*pnzj;
      aa++;
    }

    /* Sort the j index array for quick sparse axpy. */
    ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
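    /* With apj sorted, the merge below can walk each C row's column list and apj
       in lockstep, advancing nextap only when the column indices match. */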

    /* Compute P^T*A*P using outer product (P^T)[:,i]*(A*P)[i,:]. */
    pnzi = pi[i+1] - pi[i];
    for (j=0;j<pnzi;j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cj + ci[crow];
      caj    = ca + ci[crow];
      /* Perform sparse axpy operation.  Note cjj includes apj. */
      for (k=0;nextap<apnzj;k++) {
        if (cjj[k]==apj[nextap]) {
          caj[k] += (*pA)*apa[apj[nextap++]];
        }
      }
      flops += 2*apnzj;
      pA++;
    }

    /* Zero the current row info for A*P */
    for (j=0;j<apnzj;j++) {
      apa[apj[j]]      = 0.;
      apjdense[apj[j]] = 0;
    }
  }

  /* Assemble the final matrix and clean up */
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(apa);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* Compute local C = P_loc^T * A * P - used by MatPtAP_MPIAIJ_MPIAIJ() */
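/*
   A has the MPIAIJ layout A = A_diag + A_off (a->A and a->B): the diagonal block
   multiplies rows of the local part of P (P_loc), while the off-diagonal block
   multiplies the off-process rows of P supplied by the caller in P_oth, with
   a->garray mapping local off-diagonal columns to global columns.  The row
   accumulator scheme is the same as in the SeqAIJ kernel above.
*/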
static PetscEvent logkey_matptapnumeric_local = 0;
#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ_Local"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,Mat C)
{
  PetscErrorCode ierr;
  PetscInt       flops=0;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *c  = (Mat_SeqAIJ *) C->data;
  Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
  PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;
  PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
  PetscInt       *pJ=pj_loc,*pjj;
  PetscInt       *ci=c->i,*cj=c->j,*cjj;
  PetscInt       am=A->m,cn=C->N,cm=C->M;
  PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  MatScalar      *ada=ad->a,*aoa=ao->a,*apa,*paj,*ca=c->a,*caj;
  MatScalar      *pa_loc=p_loc->a,*pA=pa_loc,*pa_oth=p_oth->a;

  PetscFunctionBegin;
  if (!logkey_matptapnumeric_local) {
    ierr = PetscLogEventRegister(&logkey_matptapnumeric_local,"MatPtAPNumeric_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);

  /* Allocate temporary array for storage of one row of A*P */
  ierr = PetscMalloc(cn*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,cn*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
  apj      = (PetscInt *)(apa + cn);
  apjdense = apj + cn;

  /* Clear old values in C */
  ierr = PetscMemzero(ca,ci[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0;i<am;i++) {
    /* Form i-th sparse row of A*P */
    apnzj = 0;
    /* diagonal portion of A */
    anzi  = adi[i+1] - adi[i];
    for (j=0;j<anzi;j++) {
      prow = *adj++;
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      paj  = pa_loc + pi_loc[prow];
      for (k=0;k<pnzj;k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*ada)*paj[k];
      }
      flops += 2*pnzj;
      ada++;
    }
    /* off-diagonal portion of A */
    anzi  = aoi[i+1] - aoi[i];
    for (j=0;j<anzi;j++) {
      col = a->garray[*aoj];  /* global col */
      if (col < cstart || col >= cend) {
        prow = *aoj;
      } else {
        SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
      }
      aoj++;
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      paj  = pa_oth + pi_oth[prow];
      for (k=0;k<pnzj;k++) {
        if (!apjdense[pjj[k]]) {
          apjdense[pjj[k]] = -1;
          apj[apnzj++]     = pjj[k];
        }
        apa[pjj[k]] += (*aoa)*paj[k];
      }
      flops += 2*pnzj;
      aoa++;
    }
    /* Sort the j index array for quick sparse axpy. */
    ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);

    /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
    pnzi = pi_loc[i+1] - pi_loc[i];
    for (j=0;j<pnzi;j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cj + ci[crow];
      caj    = ca + ci[crow];
      /* Perform sparse axpy operation.  Note cjj includes apj. */
      for (k=0;nextap<apnzj;k++) {
        if (cjj[k]==apj[nextap]) {
          caj[k] += (*pA)*apa[apj[nextap++]];
        }
      }
      flops += 2*apnzj;
      pA++;
    }

    /* Zero the current row info for A*P */
    for (j=0;j<apnzj;j++) {
      apa[apj[j]]      = 0.;
      apjdense[apj[j]] = 0;
    }
  }

  /* Assemble the final matrix and clean up */
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(apa);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(logkey_matptapnumeric_local,A,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
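
/*
   Symbolic phase for the local product C = P_loc^T * A * P: using the symbolic
   transpose of P_loc, it builds the nonzero pattern of each row of C by merging
   the column lists of the required rows of P_loc and P_oth through a sorted
   linked list, accumulating the growing cj in a free-space list before
   compressing it into the final SeqAIJ structure.
*/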
static PetscEvent logkey_matptapsymbolic_local = 0;
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local(Mat A,Mat P_loc,Mat P_oth,PetscInt prstart,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  FreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*c;
  PetscInt       *pti,*ptj,*ptJ,*ajj,*pjj;
  PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
  PetscInt       *ci,*cj,*ptaj;
  PetscInt       aN=A->N,am=A->m,pN=P_loc->N;
  PetscInt       i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
  PetscInt       pm=P_loc->m,nlnk,*lnk;
  MatScalar      *ca;
  PetscBT        lnkbt;
  PetscInt       prend,nprow_loc,nprow_oth;
  PetscInt       *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
  Mat_SeqAIJ     *p_loc=(Mat_SeqAIJ*)P_loc->data,*p_oth=(Mat_SeqAIJ*)P_oth->data;
  PetscInt       *pi_loc=p_loc->i,*pj_loc=p_loc->j,*pi_oth=p_oth->i,*pj_oth=p_oth->j;

  PetscFunctionBegin;
  if (!logkey_matptapsymbolic_local) {
    ierr = PetscLogEventRegister(&logkey_matptapsymbolic_local,"MatPtAPSymbolic_MPIAIJ_MPIAIJ_Local",MAT_COOKIE);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);

  prend = prstart + pm;

  /* get ij structure of P_loc^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
  ptJ=ptj;

  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
  ptasparserow_loc = ptadenserow_loc + aN;
  ptadenserow_oth  = ptasparserow_loc + aN;
  ptasparserow_oth = ptadenserow_oth + aN;
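  /* A single allocation provides four length-aN work arrays: a dense flag array
     plus a sparse index list for the P_loc part of a row of P^T*A, and the same
     pair for the P_oth part. */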

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
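  /* lnk is a sorted linked list over the column range [0,pN), with lnkbt as its
     bit table for membership tests, so PetscLLAdd() below can merge column lists
     in sorted order. */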

  /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
  /* This should be reasonable if sparsity of PtAP is similar to that of A. */
  nnz           = adi[am] + aoi[am];
  ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic info for each row of C: */
  for (i=0;i<pN;i++) {
    ptnzi  = pti[i+1] - pti[i];
    nprow_loc = 0; nprow_oth = 0;
    /* i-th row of symbolic P_loc^T*A_loc: */
    for (j=0;j<ptnzi;j++) {
      arow = *ptJ++;
      /* diagonal portion of A */
      anzj = adi[arow+1] - adi[arow];
      ajj  = adj + adi[arow];
      for (k=0;k<anzj;k++) {
        col = ajj[k]+prstart;
        if (!ptadenserow_loc[col]) {
          ptadenserow_loc[col]    = -1;
          ptasparserow_loc[nprow_loc++] = col;
        }
      }
      /* off-diagonal portion of A */
      anzj = aoi[arow+1] - aoi[arow];
      ajj  = aoj + aoi[arow];
      for (k=0;k<anzj;k++) {
        col = a->garray[ajj[k]];  /* global col */
        if (col < cstart){
          col = ajj[k];
        } else if (col >= cend){
          col = ajj[k] + pm;
        } else {
          SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
        }
        if (!ptadenserow_oth[col]) {
          ptadenserow_oth[col]    = -1;
          ptasparserow_oth[nprow_oth++] = col;
        }
      }
    }

    /* using symbolic info of local PtA, determine symbolic info for row of C: */
    cnzi   = 0;
    /* rows of P_loc */
    ptaj = ptasparserow_loc;
    for (j=0; j<nprow_loc; j++) {
      prow = *ptaj++;
      prow -= prstart; /* rm */
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }
    /* rows of P_oth */
    ptaj = ptasparserow_oth;
    for (j=0; j<nprow_oth; j++) {
      prow = *ptaj++;
      if (prow >= prend) prow -= pm; /* rm */
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      cnzi += nlnk;
    }

    /* If free space is not available, make more free space */
    /* Double the amount of total space in the list */
    if (current_space->local_remaining<cnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, and zero out denserows */
    ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += cnzi;
    current_space->local_used      += cnzi;
    current_space->local_remaining -= cnzi;

    for (j=0;j<nprow_loc; j++) {
      ptadenserow_loc[ptasparserow_loc[j]] = 0;
    }
    for (j=0;j<nprow_oth; j++) {
      ptadenserow_oth[ptasparserow_oth[j]] = 0;
    }

    /* Aside: Perhaps we should save the pta info for the numerical factorization. */
    /*        For now, we will recompute what is needed. */
    ci[i+1] = ci[i] + cnzi;
  }
  /* nnz is now stored in ci[pN], column indices are in the list of free space */
  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* Allocate space for ca */
  ierr = PetscMalloc((ci[pN]+1)*sizeof(MatScalar),&ca);CHKERRQ(ierr);
  ierr = PetscMemzero(ca,(ci[pN]+1)*sizeof(MatScalar));CHKERRQ(ierr);

  /* put together the new matrix */
  ierr = MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,pN,pN,ci,cj,ca,C);CHKERRQ(ierr);

  /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
  /* Since these are PETSc arrays, change flags to free them as necessary. */
  c = (Mat_SeqAIJ *)((*C)->data);
  c->freedata = PETSC_TRUE;
  c->nonew    = 0;

  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);

  ierr = PetscLogEventEnd(logkey_matptapsymbolic_local,A,P_loc,P_oth,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}