/*
  src/mat/impls/aij/mpi/mpiptap.c

  Defines projective product routines in which A is an MPIAIJ matrix
          C = P^T * A * P
*/
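
/*
   These routines provide the MATMPIAIJ implementation of MatPtAP().  A typical call
   sequence might look like the following sketch (A and P are assumed to be assembled
   MPIAIJ matrices of compatible sizes; the fill estimate 1.0 is only an illustrative
   guess):

     Mat C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
     ...                                            change the numerical values (not the structure) of A and/or P
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);   reuses the symbolic data attached to P
     ierr = MatDestroy(C);CHKERRQ(ierr);
*/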

#include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
#include "src/mat/utils/freespace.h"
#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
    ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
    ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
    if (ptap->abi) {ierr = PetscFree(ptap->abi);CHKERRQ(ierr);}
    if (ptap->abj) {ierr = PetscFree(ptap->abj);CHKERRQ(ierr);}

    ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
    ierr = PetscFree(ptap);CHKERRQ(ierr);
  }

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
    ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
    ptap->abi = PETSC_NULL; ptap->abj = PETSC_NULL;
    ptap->abnz_max = 0; /* symbolic A*P is not done yet */

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);

    /* attach the supporting struct to P for reuse */
    P->ops->destroy = MatDestroy_MPIAIJ_PtAP;
    ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
    ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
    ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);

    /* now, compute symbolic local P^T*A*P */
    ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr); /* numeric product is computed as well */
    ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
  } else if (scall == MAT_REUSE_MATRIX){
    ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
    if (container) {
      ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
    } else {
      SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
    }

    /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
    ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);

    /* get P_loc by taking all local rows of P */
    ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);

  } else {
    SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
  }
  /* now, compute numeric local P^T*A*P */
  ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
  ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

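/*
   MatPtAPSymbolic_MPIAIJ_MPIAIJ - determines the nonzero structure of C = P^T*A*P and
   creates the (still unassembled) parallel matrix that will hold it.  Outline of the
   steps performed below:
     1) symbolic AP = A_loc*P, stored in ptap->abi/ptap->abj for reuse by the numeric phase;
     2) symbolic C_seq = P_loc^T*AP, a sequential intermediate product some of whose rows
        belong to other processes;
     3) the ij-structure of C_seq is exchanged between processes (nonblocking sends and
        receives of the i- and j-structures) according to the row ownership of C;
     4) each process merges its own rows of C_seq with the received rows, preallocates the
        parallel matrix B_mpi, and returns it as C.
*/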
#undef __FUNCT__
#define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode       ierr;
  Mat                  P_loc,P_oth;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer container;
  FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ           *p_loc,*p_oth;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*pjj;
  PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz;
  PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji;
  PetscInt             aN=A->N,am=A->m,pN=P->N;
  PetscInt             i,j,k,ptnzi,arow,prow,pnzj,cnzi;
  PetscBT              lnkbt;
  PetscInt             prstart,prend;
  MPI_Comm             comm=A->comm;
  Mat                  B_mpi;
  PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
  PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt             len,proc,*dnz,*onz,*owners;
  PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
  PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
  MPI_Status           *status;
  Mat_Merge_SeqsToMPI  *merge;
  PetscInt             *apsymi,*apsymj,*apj,apnzj,*rmap=p->garray,tnrows;
  /*  PetscInt             tnzrows,pcstart=p->cstart,pcend=p->cend; */

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
  if (container) {
    ierr = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* compute symbolic C_seq = P_loc^T * A_loc * P */
  /*----------------------------------------------*/
  /* get data from P_loc and P_oth */
  P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
  p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
  prend = prstart + pm;

  /* first, compute symbolic AP = A_loc*P */
  ierr = PetscMalloc((am+2)*sizeof(PetscInt),&apsymi);CHKERRQ(ierr);
  ptap->abi = apsymi;
  apsymi[0] = 0;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is 2*nnz(A) */
  ierr = GetMoreSpace((PetscInt)(2*(adi[am]+aoi[am])),&free_space);CHKERRQ(ierr);
  current_space = free_space;
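
  /* For each local row i of A, merge the column patterns of the rows of P_loc (for the
     diagonal part of A) and of P_oth (for the off-diagonal part) selected by the nonzero
     columns of A[i,:].  PetscLLAdd() inserts the new columns into the sorted linked list
     lnk, and apsymi[] accumulates the row pointers of the symbolic product AP. */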
  for (i=0;i<am;i++) {
    apnzj = 0;
    /* diagonal portion of A */
    anzi = adi[i+1] - adi[i];
    for (j=0; j<anzi; j++){
      prow = *adj++;
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      apnzj += nlnk;
    }
    /* off-diagonal portion of A */
    anzi = aoi[i+1] - aoi[i];
    for (j=0; j<anzi; j++){
      prow = *aoj++;
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
      apnzj += nlnk;
    }

    apsymi[i+1] = apsymi[i] + apnzj;
    if (ptap->abnz_max < apnzj) ptap->abnz_max = apnzj;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnzj) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,apnzj,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    current_space->array           += apnzj;
    current_space->local_used      += apnzj;
    current_space->local_remaining -= apnzj;
  }
  /* Allocate space for apsymj, initialize apsymj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((apsymi[am]+1)*sizeof(PetscInt),&ptap->abj);CHKERRQ(ierr);
  apsymj = ptap->abj;
  ierr = MakeSpaceContiguous(&free_space,ptap->abj);CHKERRQ(ierr);

  /* get ij structure of P_loc^T */
  ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);

  /* then, compute symbolic C_seq = P_loc^T*AP */
  /* Allocate ci array, arrays for fill computation and */
  /* free space for accumulating nonzero column info */
  ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
  ci[0] = 0;

  /* tnzrows = 0; */
  tnrows = (p->B)->N; /* total num of rows to be sent to other processors
                         >= (num of nonzero rows of C_seq) - pn */

  /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
  /* This should be reasonable if the sparsity of PtAP is similar to that of A. */
  nnz           = adi[am] + aoi[am];
  ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;
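
  /* For each row i of C_seq = P_loc^T*AP: its nonzero columns are the union of the column
     patterns of the rows of AP picked out by the nonzero entries of column i of P_loc
     (equivalently, row i of P_loc^T), again merged through the sorted linked list lnk. */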

  /* determine symbolic info for each row of C: */
  for (i=0; i<pN; i++) {
    cnzi  = 0;
    ptnzi = pti[i+1] - pti[i];
    if (ptnzi){
      j   = ptnzi;
      ptJ = ptj + pti[i+1];
      while (j){ /* assume cols are nearly in increasing order; starting from the end saves computation */
        j--; ptJ--;
        arow  = *ptJ; /* row of AP == col of Pt */
        apnzj = apsymi[arow+1] - apsymi[arow];
        apj   = apsymj + apsymi[arow];
        /* add non-zero cols of AP into the sorted linked list lnk */
        ierr = PetscLLAdd(apnzj,apj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
        cnzi += nlnk;
      }

      /* If free space is not available, double the total space in the list */
      if (current_space->local_remaining<cnzi) {
        ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      }

      /* Copy data into free space, then initialize lnk */
      ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
      current_space->array           += cnzi;
      current_space->local_used      += cnzi;
      current_space->local_remaining -= cnzi;
      /* tnzrows++; */
    } /* if (ptnzi) */
    ci[i+1] = ci[i] + cnzi;
  }
  /* printf("[%d] tnzrows-pn: %d,tnrows: %d, pN: %d; pcstart/end: %d, %d\n",rank,tnzrows-pn,tnrows,pN,pcstart,pcend); */

  /* Clean up. */
  ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);

  /* Allocate space for cj, initialize cj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* add C_seq into mpi C              */
  /*-----------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
  ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
  ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* determine the number of messages to send, their lengths */
  /*---------------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  len   = 0;  /* max length of buf_si[] */
  merge->nsend = 0;

  proc = 0;
  for (i=0; i<tnrows; i++){
    while (rmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++;
  }
  for (proc=0; proc<size; proc++){
    len_s[proc] = 0;
    if (len_si[proc]){
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len_s[proc]  = ci[owners[proc+1]] - ci[owners[proc]]; /* num of col indices to be sent to [proc] */
      len += len_si[proc];
    }
  }
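
  /* At this point len_s[proc] holds the number of column indices (the j-structure) to be
     sent to process [proc], and len_si[proc] = 2*(nrows+1) is the length of the matching
     i-structure message: a row count, the local row indices, and the row offsets (see the
     packing loop below). */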

  /* determine the number and length of messages to receive for ij-structure */
  /*-------------------------------------------------------------------------*/
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv of j-structure */
  /*-------------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);

  /* post the Isend of j-structure */
  /*-------------------------------*/
  ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
  sj_waits = si_waits + merge->nsend;

  for (proc=0, k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of j-structure are complete */
  /*------------------------------------------------*/
  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);

  /* send and recv i-structure */
  /*---------------------------*/
  ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s;  /* points to the beginning of the k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (local to the destination process)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners[proc]; i<owners[proc+1]; i++){
      anzi = ci[i+1] - ci[i];
      if (anzi) {
        buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */
        buf_si[nrows+1]   = i-owners[proc]; /* local row index */
        nrows++;
      }
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }

  ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);

  ierr = PetscLogInfo((PetscObject)A,"MatPtAPSymbolic_MPIAIJ_MPIAIJ: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++){
    ierr = PetscLogInfo((PetscObject)A,"MatPtAPSymbolic_MPIAIJ_MPIAIJ:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(rj_waits);CHKERRQ(ierr);
  ierr = PetscFree(si_waits);CHKERRQ(ierr);
  ierr = PetscFree(ri_waits);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  /* compute a local seq matrix on each processor */
  /*----------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* create and initialize a linked list */
  nlnk = pN+1;
  ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);

  /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
  free_space=PETSC_NULL; current_space=PETSC_NULL;
  len  = ci[owners[rank+1]] - ci[owners[rank]];
  ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* determine symbolic info for each local row */
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow = buf_ri_k + merge->nrecv;
  nextci  = nextrow + merge->nrecv;
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k];                 /* beginning of the k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of the k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
  len = 0;
  for (i=0; i<pn; i++) {
    bnzi = 0;
    /* add local non-zero cols of this proc's C_seq into lnk */
    arow = owners[rank] + i;
    anzi = ci[arow+1] - ci[arow];
    cji  = cj + ci[arow];
    ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
    bnzi += nlnk;
    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        anzi = *(nextci[k]+1) - *nextci[k];
        cji  = buf_rj[k] + *nextci[k];
        ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
        bnzi += nlnk;
        nextrow[k]++; nextci[k]++;
      }
    }
    if (len < bnzi) len = bnzi;  /* =max(bnzi) */

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<bnzi) {
      ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);

    current_space->array           += bnzi;
    current_space->local_used      += bnzi;
    current_space->local_remaining -= bnzi;

    bi[i+1] = bi[i] + bnzi;
  }

  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* create symbolic parallel matrix B_mpi */
  /*---------------------------------------*/
  ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
  ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  /* ierr = MatSetOption(B_mpi,MAT_COLUMNS_SORTED);CHKERRQ(ierr); - causes delay? */

  /* B_mpi is not ready for use - assembly will be done by MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  B_mpi->assembled    = PETSC_FALSE;
  B_mpi->ops->destroy = MatDestroy_MPIAIJ_SeqsToMPI;
  merge->bi           = bi;
  merge->bj           = bj;
  merge->ci           = ci;
  merge->cj           = cj;
  merge->buf_ri       = buf_ri;
  merge->buf_rj       = buf_rj;
  merge->C_seq        = PETSC_NULL;

  /* attach the supporting struct to B_mpi for reuse */
  ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
  ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
  *C = B_mpi;

  PetscFunctionReturn(0);
}

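/*
   MatPtAPNumeric_MPIAIJ_MPIAIJ - fills in the numerical values of C = P^T*A*P using the
   symbolic data attached to P (ptap) and to C (merge) by MatPtAPSymbolic_MPIAIJ_MPIAIJ():
     1) numeric C_seq = P_loc^T*A_loc*P is computed one row of AP at a time with a sparse
        accumulator (apa) and a sparse outer-product axpy;
     2) the values of the C_seq rows owned by other processes are exchanged with the same
        communication pattern that the symbolic phase set up;
     3) local and received values are merged row by row and inserted into C with MatSetValues().
*/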
#undef __FUNCT__
#define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode       ierr;
  Mat_Merge_SeqsToMPI  *merge;
  Mat_MatMatMultMPI    *ptap;
  PetscObjectContainer cont_merge,cont_ptap;
  PetscInt             flops=0;
  Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ           *p_loc,*p_oth;
  PetscInt             *adi=ad->i,*aoi=ao->i,*apj,cstart=a->cstart,cend=a->cend,col;
  PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
  PetscInt             i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
  PetscInt             *cjj;
  MatScalar            *ada=ad->a,*aoa=ao->a,*apa,*paj,*cseqa,*caj;
  MatScalar            *pa_loc,*pA,*pa_oth;
  PetscInt             am=A->m,cN=C->N;
  PetscInt             nextp,*adj=ad->j,*aoj=ao->j;
  MPI_Comm             comm=C->comm;
  PetscMPIInt          size,rank,taga,*len_s;
  PetscInt             *owners;
  PetscInt             proc;
  PetscInt             **buf_ri,**buf_rj;
  PetscInt             cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
  PetscInt             nrows,**buf_ri_k,**nextrow,**nextcseqi;
  MPI_Request          *s_waits,*r_waits;
  MPI_Status           *status;
  MatScalar            **abuf_r,*ba_i;
  PetscInt             *cseqi,*cseqj;
  PetscInt             *cseqj_tmp;
  MatScalar            *cseqa_tmp;
  PetscInt             stages[2];
  PetscInt             *apsymi,*apsymj;

  PetscFunctionBegin;
  ierr = PetscLogStageRegister(&stages[0],"NumPtAP_local");CHKERRQ(ierr);
  ierr = PetscLogStageRegister(&stages[1],"NumPtAP_Comm");CHKERRQ(ierr);

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
  if (cont_merge) {
    ierr = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix C does not possess an object container");
  }
  ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
  if (cont_ptap) {
    ierr = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
  } else {
    SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Matrix P does not possess an object container");
  }

  /* get data from symbolic products */
  p_loc=(Mat_SeqAIJ*)(ptap->B_loc)->data;
  p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  cseqi = merge->ci; cseqj = merge->cj;
  ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
  ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);

  /* get data from symbolic A*P */
  ierr = PetscMalloc((ptap->abnz_max+1)*sizeof(MatScalar),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,ptap->abnz_max*sizeof(MatScalar));CHKERRQ(ierr);
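
  /* apa[] is a work array holding the values of the current row of AP = A*P, stored in the
     same order as the column indices recorded in apsymj for that row; abnz_max (computed in
     the symbolic phase) is the maximum number of nonzeros in any row of AP. */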

  /* compute numeric C_seq = P_loc^T*A_loc*P */
  ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
  apsymi = ptap->abi; apsymj = ptap->abj;
  for (i=0;i<am;i++) {
    /* form i-th sparse row of A*P */
    apnzj = apsymi[i+1] - apsymi[i];
    apj   = apsymj + apsymi[i];
    /* diagonal portion of A */
    anzi  = adi[i+1] - adi[i];
    for (j=0;j<anzi;j++) {
      prow = *adj;
      adj++;
      pnzj = pi_loc[prow+1] - pi_loc[prow];
      pjj  = pj_loc + pi_loc[prow];
      paj  = pa_loc + pi_loc[prow];
      nextp = 0;
      for (k=0; nextp<pnzj; k++) {
        if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
          apa[k] += (*ada)*paj[nextp++];
        }
      }
      flops += 2*pnzj;
      ada++;
    }
    /* off-diagonal portion of A */
    anzi  = aoi[i+1] - aoi[i];
    for (j=0;j<anzi;j++) {
      /* a->garray maps the local off-diagonal column index to the global column of A;
         either way the needed row of P is stored in P_oth at local row *aoj, and the
         else-branch catches an inconsistent column map */
      col = a->garray[*aoj];
      if (col < cstart){
        prow = *aoj;
      } else if (col >= cend){
        prow = *aoj;
      } else {
        SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
      }
      aoj++;
      pnzj = pi_oth[prow+1] - pi_oth[prow];
      pjj  = pj_oth + pi_oth[prow];
      paj  = pa_oth + pi_oth[prow];
      nextp = 0;
      for (k=0; nextp<pnzj; k++) {
        if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
          apa[k] += (*aoa)*paj[nextp++];
        }
      }
      flops += 2*pnzj;
      aoa++;
    }

    /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
    pnzi = pi_loc[i+1] - pi_loc[i];
    for (j=0;j<pnzi;j++) {
      nextap = 0;
      crow   = *pJ++;
      cjj    = cseqj + cseqi[crow];
      caj    = cseqa + cseqi[crow];
      /* Perform sparse axpy operation.  Note cjj includes apj. */
      for (k=0;nextap<apnzj;k++) {
        if (cjj[k]==apj[nextap]) caj[k] += (*pA)*apa[nextap++];
      }
      flops += 2*apnzj;
      pA++;
    }
    /* zero the current row info for A*P */
    ierr = PetscMemzero(apa,apnzj*sizeof(MatScalar));CHKERRQ(ierr);
  }

  ierr = PetscFree(apa);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);

  bi     = merge->bi;
  bj     = merge->bj;
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;

  ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
  ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);

  /* send and recv matrix values */
  /*-----------------------------*/
  ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
  len_s = merge->len_s;
  ierr  = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
  ierr  = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners[proc];
    ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }

  ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
  ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(status);CHKERRQ(ierr);

  ierr = PetscFree(s_waits);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);

  /* insert mat values of mpimat */
  /*-----------------------------*/
  ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr);
  ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
  nextrow   = buf_ri_k + merge->nrecv;
  nextcseqi = nextrow + merge->nrecv;

  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k]  = buf_ri[k];                 /* beginning of the k-th received i-structure */
    nrows        = *(buf_ri_k[k]);
    nextrow[k]   = buf_ri_k[k]+1;             /* next row number of the k-th received i-structure */
    nextcseqi[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of the k-th received i-structure */
  }

  /* set values of ba */
  for (i=0; i<C->m; i++) {
    cseqrow = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj+bi[i];  /* col indices of the i-th row of C */
    bnzi = bi[i+1] - bi[i];
    ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);

    /* add local non-zero vals of this proc's C_seq into ba */
    cseqnzi   = cseqi[cseqrow+1] - cseqi[cseqrow];
    cseqj_tmp = cseqj + cseqi[cseqrow];
    cseqa_tmp = cseqa + cseqi[cseqrow];
    nextcseqj = 0;
    for (j=0; nextcseqj<cseqnzi; j++){
      if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
        ba_i[j] += cseqa_tmp[nextcseqj++];
      }
    }

    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cseqnzi   = *(nextcseqi[k]+1) - *nextcseqi[k];
        cseqj_tmp = buf_rj[k] + *(nextcseqi[k]);
        cseqa_tmp = abuf_r[k] + *(nextcseqi[k]);
        nextcseqj = 0;
        for (j=0; nextcseqj<cseqnzi; j++){
          if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
            ba_i[j] += cseqa_tmp[nextcseqj++];
          }
        }
        nextrow[k]++; nextcseqi[k]++;
      }
    }
    ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
    flops += 2*bnzi;
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscLogStagePop();CHKERRQ(ierr);

  ierr = PetscFree(cseqa);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree(ba_i);CHKERRQ(ierr);
  ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
  ierr = PetscLogFlops(flops);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}