xref: /petsc/src/mat/impls/aij/mpi/mpiptap.c (revision 42c5748986aa8361ae79e4b75960f54f11bcd0d9)
1 /*
2   Defines projective product routines where A is an MPIAIJ matrix
3           C = P^T * A * P
4 */
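/*
  Rough outline of the implementation below (a summary of the code, not a specification):
    Symbolic phase (MatPtAPSymbolic_MPIAIJ_MPIAIJ):
      - P_loc holds the locally owned rows of P; P_oth holds the off-processor
        rows of P that correspond to nonzero off-diagonal columns of the local A.
      - Each process computes the nonzero structure of its contribution
        C_seq = P_loc^T * A_loc * P (rows kept in global numbering); the rows are
        then redistributed to their owning processes and merged, and the merged
        structure preallocates the parallel matrix C.
    Numeric phase (MatPtAPNumeric_MPIAIJ_MPIAIJ):
      - Row i of A*P is formed in a sparse accumulator, the outer products
        P_loc[i,:]^T * (A*P)[i,:] are added into C_seq, the values are communicated
        row-wise, and MatSetValues() plus MatAssemblyBegin/End fill C.
    The supporting data is cached in PetscObjectContainers ("MatPtAPMPI" on P,
    "MatMergeSeqsToMPI" on C) so that MAT_REUSE_MATRIX calls can skip the symbolic work.
*/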
5 
6 #include "src/mat/impls/aij/seq/aij.h"   /*I "petscmat.h" I*/
7 #include "src/mat/utils/freespace.h"
8 #include "src/mat/impls/aij/mpi/mpiaij.h"
9 #include "petscbt.h"
10 
11 EXTERN PetscErrorCode MatDestroy_MPIAIJ(Mat);
12 #undef __FUNCT__
13 #define __FUNCT__ "MatDestroy_MPIAIJ_PtAP"
14 PetscErrorCode MatDestroy_MPIAIJ_PtAP(Mat A)
15 {
16   PetscErrorCode       ierr;
17   Mat_MatMatMultMPI    *ptap;
18   PetscObjectContainer container;
19 
20   PetscFunctionBegin;
21   ierr = PetscObjectQuery((PetscObject)A,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
22   if (container) {
23     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
24     ierr = MatDestroy(ptap->B_loc);CHKERRQ(ierr);
25     ierr = MatDestroy(ptap->B_oth);CHKERRQ(ierr);
26     ierr = ISDestroy(ptap->isrowb);CHKERRQ(ierr);
27     ierr = ISDestroy(ptap->iscolb);CHKERRQ(ierr);
28     if (ptap->abi) {ierr = PetscFree(ptap->abi);CHKERRQ(ierr);}
29     if (ptap->abj) {ierr = PetscFree(ptap->abj);CHKERRQ(ierr);}
30 
31     ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
32     ierr = PetscObjectCompose((PetscObject)A,"MatPtAPMPI",0);CHKERRQ(ierr);
33     ierr = PetscFree(ptap);CHKERRQ(ierr); /* ptap exists only when the container was attached */
34   }
35 
36   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
37   PetscFunctionReturn(0);
38 }
39 
40 #undef __FUNCT__
41 #define __FUNCT__ "MatPtAP_MPIAIJ_MPIAIJ"
42 PetscErrorCode MatPtAP_MPIAIJ_MPIAIJ(Mat A,Mat P,MatReuse scall,PetscReal fill,Mat *C)
43 {
44   PetscErrorCode       ierr;
45   Mat_MatMatMultMPI    *ptap;
46   PetscObjectContainer container;
47 
48   PetscFunctionBegin;
49   if (scall == MAT_INITIAL_MATRIX){
50     ierr = PetscLogEventBegin(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
51     ierr = PetscNew(Mat_MatMatMultMPI,&ptap);CHKERRQ(ierr);
52     ptap->abi=PETSC_NULL; ptap->abj=PETSC_NULL;
53     ptap->abnz_max = 0; /* symbolic A*P is not done yet */
54 
55     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
56     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
57 
58     /* get P_loc by taking all local rows of P */
59     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
60 
61     /* attach the supporting struct to P for reuse */
62     P->ops->destroy  = MatDestroy_MPIAIJ_PtAP;
63     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
64     ierr = PetscObjectContainerSetPointer(container,ptap);CHKERRQ(ierr);
65     ierr = PetscObjectCompose((PetscObject)P,"MatPtAPMPI",(PetscObject)container);CHKERRQ(ierr);
66 
67     /* now, compute symbolic local P^T*A*P */
68     ierr = MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);CHKERRQ(ierr);/* numeric product is computed as well */
69     ierr = PetscLogEventEnd(MAT_PtAPSymbolic,A,P,0,0);CHKERRQ(ierr);
70   } else if (scall == MAT_REUSE_MATRIX){
71     ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
72     if (container) {
73       ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
74     } else {
75       SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
76     }
77 
78     /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
79     ierr = MatGetBrowsOfAoCols(A,P,scall,&ptap->isrowb,&ptap->iscolb,&ptap->brstart,&ptap->B_oth);CHKERRQ(ierr);
80 
81     /* get P_loc by taking all local rows of P */
82     ierr = MatGetLocalMat(P,scall,&ptap->B_loc);CHKERRQ(ierr);
83 
84   } else {
85     SETERRQ1(PETSC_ERR_ARG_WRONG,"Invalid MatReuse %d",scall);
86   }
87   /* now, compute numeric local P^T*A*P */
88   ierr = PetscLogEventBegin(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
89   ierr = MatPtAPNumeric_MPIAIJ_MPIAIJ(A,P,*C);CHKERRQ(ierr);
90   ierr = PetscLogEventEnd(MAT_PtAPNumeric,A,P,0,0);CHKERRQ(ierr);
91 
92   PetscFunctionReturn(0);
93 }
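/*
  Typical call sequence from application code (a rough sketch; MatPtAP() is the
  public interface that dispatches to MatPtAP_MPIAIJ_MPIAIJ() above when A and P
  are MPIAIJ matrices):

     Mat A,P,C;
     ierr = MatPtAP(A,P,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
     .... change the numerical values of A (same nonzero pattern) ....
     ierr = MatPtAP(A,P,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);
     ierr = MatDestroy(C);CHKERRQ(ierr);

  The fill argument (1.0 here) is the expected fill ratio used below to scale
  nnz(A) when sizing the initial free space of the symbolic phase.
*/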
94 
95 #undef __FUNCT__
96 #define __FUNCT__ "MatPtAPSymbolic_MPIAIJ_MPIAIJ"
97 PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
98 {
99   PetscErrorCode       ierr;
100   Mat                  P_loc,P_oth;
101   Mat_MatMatMultMPI    *ptap;
102   PetscObjectContainer container;
103   FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
104   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data;
105   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
106   Mat_SeqAIJ           *p_loc,*p_oth;
107   PetscInt             *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pti,*ptj,*ptJ,*ajj,*pjj;
108   PetscInt             *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,nnz,cstart=a->cstart,cend=a->cend,col;
109   PetscInt             pm=P->m,pn=P->n,nlnk,*lnk,*ci,*cj,*cji,*ptaj;
110   PetscInt             aN=A->N,am=A->m,pN=P->N;
111   PetscInt             i,j,k,ptnzi,arow,anzj,prow,pnzj,cnzi;
112   PetscBT              lnkbt;
113   PetscInt             prstart,prend,nprow_loc,nprow_oth;
114   PetscInt             *ptadenserow_loc,*ptadenserow_oth,*ptasparserow_loc,*ptasparserow_oth;
115 
116   MPI_Comm             comm=A->comm;
117   Mat                  B_mpi;
118   PetscMPIInt          size,rank,tagi,tagj,*len_s,*len_si,*len_ri;
119   PetscInt             **buf_rj,**buf_ri,**buf_ri_k;
120   PetscInt             len,proc,*dnz,*onz,*owners;
121   PetscInt             anzi,*bi,*bj,bnzi,nspacedouble=0;
122   PetscInt             nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
123   MPI_Request          *si_waits,*sj_waits,*ri_waits,*rj_waits;
124   MPI_Status           *status;
125   Mat_Merge_SeqsToMPI  *merge;
126 
127   PetscFunctionBegin;
128   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&container);CHKERRQ(ierr);
129   if (container) {
130     ierr  = PetscObjectContainerGetPointer(container,(void **)&ptap);CHKERRQ(ierr);
131   } else {
132     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
133   }
134   /* compute symbolic C_seq = P_loc^T * A_loc * P, with the needed rows of P held locally as P_loc and P_oth */
135   /*--------------------------------------------------*/
136   /* get data from P_loc and P_oth */
137   P_loc=ptap->B_loc; P_oth=ptap->B_oth; prstart=ptap->brstart;
138   p_loc=(Mat_SeqAIJ*)P_loc->data; p_oth=(Mat_SeqAIJ*)P_oth->data;
139   pi_loc=p_loc->i; pj_loc=p_loc->j; pi_oth=p_oth->i; pj_oth=p_oth->j;
140   prend = prstart + pm;
141 
142   /* get ij structure of P_loc^T */
143   ierr = MatGetSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
144   ptJ=ptj;
145 
146   /* Allocate ci array, arrays for fill computation and */
147   /* free space for accumulating nonzero column info */
148   ierr = PetscMalloc((pN+1)*sizeof(PetscInt),&ci);CHKERRQ(ierr);
149   ci[0] = 0;
150 
151   ierr = PetscMalloc((4*aN+1)*sizeof(PetscInt),&ptadenserow_loc);CHKERRQ(ierr);
152   ierr = PetscMemzero(ptadenserow_loc,(4*aN+1)*sizeof(PetscInt));CHKERRQ(ierr);
153   ptasparserow_loc = ptadenserow_loc + aN;
154   ptadenserow_oth  = ptasparserow_loc + aN;
155   ptasparserow_oth = ptadenserow_oth + aN;
156 
157   /* create and initialize a linked list */
158   nlnk = pN+1;
159   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
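  /* Workspace used per row i of P^T*A below:
       ptadenserow_loc/_oth are dense flag arrays marking which rows of P_loc/P_oth
       already contribute to row i, and ptasparserow_loc/_oth record just the flagged
       entries so that only those are reset afterwards.  The linked list lnk (with bit
       array lnkbt) accumulates the sorted union of the P column indices contributed by
       those rows, giving the nonzero structure of row i of C_seq. */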
160 
161   /* Set initial free space to be nnz(A) scaled by fill*P->N/P->M. */
162   /* This should be reasonable if sparsity of PtAP is similar to that of A. */
163   nnz           = adi[am] + aoi[am];
164   ierr          = GetMoreSpace((PetscInt)(fill*nnz*pN/aN+1),&free_space);CHKERRQ(ierr);
165   current_space = free_space;
166 
167   /* determine symbolic info for each row of C: */
168   for (i=0;i<pN;i++) {
169     ptnzi  = pti[i+1] - pti[i];
170     nprow_loc = 0; nprow_oth = 0;
171     /* i-th row of symbolic P_loc^T*A_loc: */
172     for (j=0;j<ptnzi;j++) {
173       arow = *ptJ++;
174       /* diagonal portion of A */
175       anzj = adi[arow+1] - adi[arow];
176       ajj  = adj + adi[arow];
177       for (k=0;k<anzj;k++) {
178         col = ajj[k]+prstart;
179         if (!ptadenserow_loc[col]) {
180           ptadenserow_loc[col]    = -1;
181           ptasparserow_loc[nprow_loc++] = col;
182         }
183       }
184       /* off-diagonal portion of A */
185       anzj = aoi[arow+1] - aoi[arow];
186       ajj  = aoj + aoi[arow];
187       for (k=0;k<anzj;k++) {
188         col = a->garray[ajj[k]];  /* global col */
189         if (col < cstart){
190           col = ajj[k];
191         } else if (col >= cend){
192           col = ajj[k] + pm;
193         } else {
194           SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
195         }
196         if (!ptadenserow_oth[col]) {
197           ptadenserow_oth[col]    = -1;
198           ptasparserow_oth[nprow_oth++] = col;
199         }
200       }
201     }
202 
203     /* using symbolic info of local PtA, determine symbolic info for row of C: */
204     cnzi   = 0;
205     /* rows of P_loc */
206     ptaj = ptasparserow_loc;
207     for (j=0; j<nprow_loc; j++) {
208       prow = *ptaj++;
209       prow -= prstart; /* rm */
210       pnzj = pi_loc[prow+1] - pi_loc[prow];
211       pjj  = pj_loc + pi_loc[prow];
212       /* add non-zero cols of P into the sorted linked list lnk */
213       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
214       cnzi += nlnk;
215     }
216     /* rows of P_oth */
217     ptaj = ptasparserow_oth;
218     for (j=0; j<nprow_oth; j++) {
219       prow = *ptaj++;
220       if (prow >= prend) prow -= pm; /* rm */
221       pnzj = pi_oth[prow+1] - pi_oth[prow];
222       pjj  = pj_oth + pi_oth[prow];
223       /* add non-zero cols of P into the sorted linked list lnk */
224       ierr = PetscLLAdd(pnzj,pjj,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
225       cnzi += nlnk;
226     }
227 
228     /* If free space is not available, make more free space */
229     /* Double the amount of total space in the list */
230     if (current_space->local_remaining<cnzi) {
231       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
232     }
233 
234     /* Copy data into free space, and zero out denserows */
235     ierr = PetscLLClean(pN,pN,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
236     current_space->array           += cnzi;
237     current_space->local_used      += cnzi;
238     current_space->local_remaining -= cnzi;
239 
240     for (j=0;j<nprow_loc; j++) {
241       ptadenserow_loc[ptasparserow_loc[j]] = 0;
242     }
243     for (j=0;j<nprow_oth; j++) {
244       ptadenserow_oth[ptasparserow_oth[j]] = 0;
245     }
246 
247     /* Aside: Perhaps we should save the pta info for the numerical factorization. */
248     /*        For now, we will recompute what is needed. */
249     ci[i+1] = ci[i] + cnzi;
250   }
251   /* Clean up. */
252   ierr = MatRestoreSymbolicTranspose_SeqAIJ(P_loc,&pti,&ptj);CHKERRQ(ierr);
253 
254   /* nnz is now stored in ci[pN], column indices are in the list of free space */
255   /* Allocate space for cj, initialize cj, and */
256   /* destroy list of free space and other temporary array(s) */
257   ierr = PetscMalloc((ci[pN]+1)*sizeof(PetscInt),&cj);CHKERRQ(ierr);
258   ierr = MakeSpaceContiguous(&free_space,cj);CHKERRQ(ierr);
259   ierr = PetscFree(ptadenserow_loc);CHKERRQ(ierr);
260   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
261 
262   /* merge the local C_seq structures into the parallel (MPI) matrix C */
263   /*-----------------------------------*/
264   free_space=PETSC_NULL; current_space=PETSC_NULL;
265 
266   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
267   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
268 
269   ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
270 
271 
272   /* determine row ownership */
273   /*---------------------------------------------------------*/
274   ierr = PetscMapCreate(comm,&merge->rowmap);CHKERRQ(ierr);
275   ierr = PetscMapSetLocalSize(merge->rowmap,pn);CHKERRQ(ierr);
276   ierr = PetscMapSetType(merge->rowmap,MAP_MPI);CHKERRQ(ierr);
277   ierr = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
278 
279   /* determine the number of messages to send, their lengths */
280   /*---------------------------------------------------------*/
281   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
282   ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
283   len_s  = merge->len_s;
284   len = 0;  /* length of buf_si[] */
285   merge->nsend = 0;
286   for (proc=0; proc<size; proc++){
287     len_si[proc] = 0;
288     if (proc == rank){
289       len_s[proc] = 0;
290     } else {
291       len_si[proc] = owners[proc+1] - owners[proc] + 1;
292       len_s[proc] = ci[owners[proc+1]] - ci[owners[proc]]; /* num of rows to be sent to [proc] */
293     }
294     if (len_s[proc]) {
295       merge->nsend++;
296       nrows = 0;
297       for (i=owners[proc]; i<owners[proc+1]; i++){
298         if (ci[i+1] > ci[i]) nrows++;
299       }
300       len_si[proc] = 2*(nrows+1);
301       len += len_si[proc];
302     }
303   }
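  /* For each destination processor: len_s[proc] counts the column indices
     (j-structure entries) to be sent, and len_si[proc] = 2*(nrows+1) is the length
     of the matching i-structure message (row count, local row indices, row pointers),
     whose layout is described where the message is packed below. */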
304 
305   /* determine the number and length of messages to receive for ij-structure */
306   /*-------------------------------------------------------------------------*/
307   ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
308   ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);
309 
310   /* post the Irecv of j-structure */
311   /*-------------------------------*/
312   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagj);CHKERRQ(ierr);
313   ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rj_waits);CHKERRQ(ierr);
314 
315   /* post the Isend of j-structure */
316   /*--------------------------------*/
317   ierr = PetscMalloc((2*merge->nsend+1)*sizeof(MPI_Request),&si_waits);CHKERRQ(ierr);
318   sj_waits = si_waits + merge->nsend;
319 
320   for (proc=0, k=0; proc<size; proc++){
321     if (!len_s[proc]) continue;
322     i = owners[proc];
323     ierr = MPI_Isend(cj+ci[i],len_s[proc],MPIU_INT,proc,tagj,comm,sj_waits+k);CHKERRQ(ierr);
324     k++;
325   }
326 
327   /* receives and sends of j-structure are complete */
328   /*------------------------------------------------*/
329   ierr = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
330   ierr = MPI_Waitall(merge->nrecv,rj_waits,status);CHKERRQ(ierr);
331   ierr = MPI_Waitall(merge->nsend,sj_waits,status);CHKERRQ(ierr);
332 
333   /* send and recv i-structure */
334   /*---------------------------*/
335   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&tagi);CHKERRQ(ierr);
336   ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&ri_waits);CHKERRQ(ierr);
337 
338   ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
339   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
340   for (proc=0,k=0; proc<size; proc++){
341     if (!len_s[proc]) continue;
342     /* form outgoing message for i-structure:
343          buf_si[0]:                 nrows to be sent
344                [1:nrows]:           row index (local to the destination processor)
345                [nrows+1:2*nrows+1]: i-structure index
346     */
347     /*-------------------------------------------*/
348     nrows = len_si[proc]/2 - 1;
349     buf_si_i    = buf_si + nrows+1;
350     buf_si[0]   = nrows;
351     buf_si_i[0] = 0;
352     nrows = 0;
353     for (i=owners[proc]; i<owners[proc+1]; i++){
354       anzi = ci[i+1] - ci[i];
355       if (anzi) {
356         buf_si_i[nrows+1] = buf_si_i[nrows] + anzi; /* i-structure */
357         buf_si[nrows+1] = i-owners[proc]; /* local row index */
358         nrows++;
359       }
360     }
361     ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,si_waits+k);CHKERRQ(ierr);
362     k++;
363     buf_si += len_si[proc];
364   }
365 
366   ierr = MPI_Waitall(merge->nrecv,ri_waits,status);CHKERRQ(ierr);
367   ierr = MPI_Waitall(merge->nsend,si_waits,status);CHKERRQ(ierr);
368 
369   ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI: nsend: %d, nrecv: %d\n",merge->nsend,merge->nrecv);CHKERRQ(ierr);
370   for (i=0; i<merge->nrecv; i++){
371     ierr = PetscLogInfo((PetscObject)A,"MatMerge_SeqsToMPI:   recv len_ri=%d, len_rj=%d from [%d]\n",len_ri[i],merge->len_r[i],merge->id_r[i]);CHKERRQ(ierr);
372   }
373 
374   ierr = PetscFree(len_si);CHKERRQ(ierr);
375   ierr = PetscFree(len_ri);CHKERRQ(ierr);
376   ierr = PetscFree(rj_waits);CHKERRQ(ierr);
377   ierr = PetscFree(si_waits);CHKERRQ(ierr);
378   ierr = PetscFree(ri_waits);CHKERRQ(ierr);
379   ierr = PetscFree(buf_s);CHKERRQ(ierr);
380   ierr = PetscFree(status);CHKERRQ(ierr);
381 
382   /* compute a local seq matrix in each processor */
383   /*----------------------------------------------*/
384   /* allocate bi array and free space for accumulating nonzero column info */
385   ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
386   bi[0] = 0;
387 
388   /* create and initialize a linked list */
389   nlnk = pN+1;
390   ierr = PetscLLCreate(pN,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
391 
392   /* initial FreeSpace size is 2*(num of local nnz(C_seq)) */
393   /* number of nonzeros in the locally owned rows of C_seq */
394   len  = ci[owners[rank+1]] - ci[owners[rank]];
395   ierr = GetMoreSpace((PetscInt)(2*len+1),&free_space);CHKERRQ(ierr);
396   current_space = free_space;
397 
398   /* determine symbolic info for each local row */
399   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
400   nextrow = buf_ri_k + merge->nrecv;
401   nextci  = nextrow + merge->nrecv;
402   for (k=0; k<merge->nrecv; k++){
403     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
404     nrows = *buf_ri_k[k];
405     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number in k-th received i-structure */
406     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure entry of the k-th received message */
407   }
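  /* Each locally owned row of C is the union of the column indices computed locally
     (cj, indexed through ci) and those received from other processes (buf_rj);
     nextrow[k]/nextci[k] walk the k-th received i-structure so the matching received
     rows can be merged in order.  The merged row lengths feed MatPreallocateSet()
     below to obtain the diagonal/off-diagonal preallocation (dnz/onz) of the
     parallel matrix. */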
408 
409   ierr = MatPreallocateInitialize(comm,pn,pn,dnz,onz);CHKERRQ(ierr);
410   len = 0;
411   for (i=0; i<pn; i++) {
412     bnzi   = 0;
413     /* add local non-zero cols of this proc's C_seq into lnk */
414     arow   = owners[rank] + i;
415     anzi   = ci[arow+1] - ci[arow];
416     cji    = cj + ci[arow];
417     ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
418     bnzi += nlnk;
419     /* add received col data into lnk */
420     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
421       if (i == *nextrow[k]) { /* i-th row */
422         anzi = *(nextci[k]+1) - *nextci[k];
423         cji  = buf_rj[k] + *nextci[k];
424         ierr = PetscLLAdd(anzi,cji,pN,nlnk,lnk,lnkbt);CHKERRQ(ierr);
425         bnzi += nlnk;
426         nextrow[k]++; nextci[k]++;
427       }
428     }
429     if (len < bnzi) len = bnzi;  /* =max(bnzi) */
430 
431     /* if free space is not available, make more free space */
432     if (current_space->local_remaining<bnzi) {
433       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
434       nspacedouble++;
435     }
436     /* copy data into free space, then initialize lnk */
437     ierr = PetscLLClean(pN,pN,bnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr);
438     ierr = MatPreallocateSet(i+owners[rank],bnzi,current_space->array,dnz,onz);CHKERRQ(ierr);
439 
440     current_space->array           += bnzi;
441     current_space->local_used      += bnzi;
442     current_space->local_remaining -= bnzi;
443 
444     bi[i+1] = bi[i] + bnzi;
445   }
446 
447   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
448 
449   ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
450   ierr = MakeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
451   ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
452 
453   /* create symbolic parallel matrix B_mpi */
454   /*---------------------------------------*/
455   ierr = MatCreate(comm,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE,&B_mpi);CHKERRQ(ierr);
456   ierr = MatSetType(B_mpi,MATMPIAIJ);CHKERRQ(ierr);
457   ierr = MatMPIAIJSetPreallocation(B_mpi,0,dnz,0,onz);CHKERRQ(ierr);
458   ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
459   /* ierr = MatSetOption(B_mpi,MAT_COLUMNS_SORTED);CHKERRQ(ierr); -cause delay? */
460 
461   /* B_mpi is not ready for use - assembly will be done in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
462   B_mpi->assembled     = PETSC_FALSE;
463   B_mpi->ops->destroy  = MatDestroy_MPIAIJ_SeqsToMPI;
464   merge->bi            = bi;
465   merge->bj            = bj;
466   merge->ci            = ci;
467   merge->cj            = cj;
468   merge->buf_ri        = buf_ri;
469   merge->buf_rj        = buf_rj;
470   merge->C_seq         = PETSC_NULL;
471 
472   /* attach the supporting struct to B_mpi for reuse */
473   ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
474   ierr = PetscObjectContainerSetPointer(container,merge);CHKERRQ(ierr);
475   ierr = PetscObjectCompose((PetscObject)B_mpi,"MatMergeSeqsToMPI",(PetscObject)container);CHKERRQ(ierr);
476   *C = B_mpi;
477 
478   PetscFunctionReturn(0);
479 }
480 
481 #undef __FUNCT__
482 #define __FUNCT__ "MatPtAPNumeric_MPIAIJ_MPIAIJ"
483 PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
484 {
485   PetscErrorCode       ierr;
486   Mat_Merge_SeqsToMPI  *merge;
487   Mat_MatMatMultMPI    *ptap;
488   PetscObjectContainer cont_merge,cont_ptap;
489 
490   PetscInt       flops=0;
491   Mat_MPIAIJ     *a=(Mat_MPIAIJ*)A->data;
492   Mat_SeqAIJ     *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
493   Mat_SeqAIJ     *p_loc,*p_oth;
494   PetscInt       *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,*apj,*apjdense,cstart=a->cstart,cend=a->cend,col;
495   PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pJ,*pjj;
496   PetscInt       i,j,k,anzi,pnzi,apnzj,nextap,pnzj,prow,crow;
497   PetscInt       *cjj;
498   MatScalar      *ada=ad->a,*aoa=ao->a,*ada_tmp=ad->a,*aoa_tmp=ao->a,*apa,*paj,*cseqa,*caj;
499   MatScalar      *pa_loc,*pA,*pa_oth;
500   PetscInt       am=A->m,cN=C->N;
501   PetscInt       nextp,*adj_tmp=ad->j,*aoj_tmp=ao->j;
502 
503   MPI_Comm             comm=C->comm;
504   PetscMPIInt          size,rank,taga,*len_s;
505   PetscInt             *owners;
506   PetscInt             proc;
507   PetscInt             **buf_ri,**buf_rj;
508   PetscInt             cseqnzi,*bj_i,*bi,*bj,cseqrow,bnzi,nextcseqj; /* bi, bj, ba: for C (mpi mat) */
509   PetscInt             nrows,**buf_ri_k,**nextrow,**nextcseqi;
510   MPI_Request          *s_waits,*r_waits;
511   MPI_Status           *status;
512   MatScalar            **abuf_r,*ba_i;
513   PetscInt             *cseqi,*cseqj;
514   PetscInt             *cseqj_tmp;
515   MatScalar            *cseqa_tmp;
516   PetscInt             stages[3];
517   PetscInt             *apsymi,*apsymj;
518   FreeSpaceList        free_space=PETSC_NULL,current_space=PETSC_NULL;
519 
520   PetscFunctionBegin;
521   ierr = PetscLogStageRegister(&stages[0],"SymAP");CHKERRQ(ierr);
522   ierr = PetscLogStageRegister(&stages[1],"NumPtAP_local");CHKERRQ(ierr);
523   ierr = PetscLogStageRegister(&stages[2],"NumPtAP_Comm");CHKERRQ(ierr);
524 
525   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
526   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
527 
528   ierr = PetscObjectQuery((PetscObject)C,"MatMergeSeqsToMPI",(PetscObject *)&cont_merge);CHKERRQ(ierr);
529   if (cont_merge) {
530     ierr  = PetscObjectContainerGetPointer(cont_merge,(void **)&merge);CHKERRQ(ierr);
531   } else {
532     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix C does not possess an object container");
533   }
534   ierr = PetscObjectQuery((PetscObject)P,"MatPtAPMPI",(PetscObject *)&cont_ptap);CHKERRQ(ierr);
535   if (cont_ptap) {
536     ierr  = PetscObjectContainerGetPointer(cont_ptap,(void **)&ptap);CHKERRQ(ierr);
537   } else {
538     SETERRQ(PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
539   }
540 
541   /* get data from symbolic products */
542   p_loc=(Mat_SeqAIJ*)(ptap->B_loc)->data;
543   p_oth=(Mat_SeqAIJ*)(ptap->B_oth)->data;
544   pi_loc=p_loc->i; pj_loc=p_loc->j; pJ=pj_loc; pa_loc=p_loc->a; pA=pa_loc;
545   pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;
546 
547   cseqi = merge->ci; cseqj=merge->cj;
548   ierr  = PetscMalloc((cseqi[cN]+1)*sizeof(MatScalar),&cseqa);CHKERRQ(ierr);
549   ierr  = PetscMemzero(cseqa,cseqi[cN]*sizeof(MatScalar));CHKERRQ(ierr);
550 
551   /* ---------- Compute symbolic A*P --------- */
552   if (!ptap->abnz_max){
553     ierr = PetscLogStagePush(stages[0]);CHKERRQ(ierr);
554     ierr  = PetscMalloc((am+2)*sizeof(PetscInt),&apsymi);CHKERRQ(ierr);
555     ptap->abi = apsymi;
556     apsymi[0] = 0;
557     ierr = PetscMalloc(cN*(sizeof(MatScalar)+2*sizeof(PetscInt)),&apa);CHKERRQ(ierr);
558     ierr = PetscMemzero(apa,cN*(sizeof(MatScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
559     apj  = (PetscInt *)(apa + cN);
560     apjdense = apj + cN;
561     /* Initial FreeSpace size is 2*nnz(A) */
562     ierr = GetMoreSpace((PetscInt)(2*(adi[am]+aoi[am])),&free_space);CHKERRQ(ierr);
563     current_space = free_space;
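    /* First call only: build the nonzero structure of A*P one row at a time.
       apjdense is a dense flag array of length cN (the global number of columns of P),
       apj collects the column indices of row i of A*P (sorted with PetscSortInt()),
       and the row pointers/column indices are kept in ptap->abi/ptap->abj so that
       later MAT_REUSE_MATRIX calls can skip this scan. */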
564 
565   for (i=0;i<am;i++) {
566     apnzj = 0;
567     /* diagonal portion of A */
568     anzi  = adi[i+1] - adi[i];
569     for (j=0;j<anzi;j++) {
570       prow = *adj;
571       adj++;
572       pnzj = pi_loc[prow+1] - pi_loc[prow];
573       pjj  = pj_loc + pi_loc[prow];
574       paj  = pa_loc + pi_loc[prow];
575       for (k=0;k<pnzj;k++) {
576         if (!apjdense[pjj[k]]) {
577           apjdense[pjj[k]] = -1;
578           apj[apnzj++]     = pjj[k];
579         }
580       }
581       flops += 2*pnzj;
582       ada++;
583     }
584     /* off-diagonal portion of A */
585     anzi  = aoi[i+1] - aoi[i];
586     for (j=0;j<anzi;j++) {
587       col = a->garray[*aoj];
588       if (col < cstart){
589         prow = *aoj;
590       } else if (col >= cend){
591         prow = *aoj;
592       } else {
593         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
594       }
595       aoj++;
596       pnzj = pi_oth[prow+1] - pi_oth[prow];
597       pjj  = pj_oth + pi_oth[prow];
598       paj  = pa_oth + pi_oth[prow];
599       for (k=0;k<pnzj;k++) {
600         if (!apjdense[pjj[k]]) {
601           apjdense[pjj[k]] = -1;
602           apj[apnzj++]     = pjj[k];
603         }
604       }
605       flops += 2*pnzj;
606       aoa++;
607     }
608     /* Sort the j index array for quick sparse axpy. */
609     ierr = PetscSortInt(apnzj,apj);CHKERRQ(ierr);
610 
611     apsymi[i+1] = apsymi[i] + apnzj;
612     if (ptap->abnz_max < apnzj) ptap->abnz_max = apnzj;
613 
614     /* If free space is not available, make more free space */
615     /* Double the amount of total space in the list */
616     if (current_space->local_remaining<apnzj) {
617       ierr = GetMoreSpace(current_space->total_array_size,&current_space);CHKERRQ(ierr);
618     }
619 
620     /* Copy data into free space, then initialize lnk */
621     /* ierr = PetscLLClean(bn,bn,cnzi,lnk,current_space->array,lnkbt);CHKERRQ(ierr); */
622     for (j=0; j<apnzj; j++) current_space->array[j] = apj[j];
623     current_space->array           += apnzj;
624     current_space->local_used      += apnzj;
625     current_space->local_remaining -= apnzj;
626 
627     for (j=0;j<apnzj;j++) apjdense[apj[j]] = 0;
628   }
629   /* Allocate space for apsymj, initialize apsymj, and */
630   /* destroy list of free space and other temporary array(s) */
631   ierr = PetscMalloc((apsymi[am]+1)*sizeof(PetscInt),&ptap->abj);CHKERRQ(ierr);
632   ierr = MakeSpaceContiguous(&free_space,ptap->abj);CHKERRQ(ierr);
633   ierr = PetscLogStagePop();CHKERRQ(ierr);
634   }
635   /* ------- end of Compute symbolic A*P ------- */
636   else {
637     /* get data from symbolic A*P */
638     ierr = PetscMalloc((ptap->abnz_max+1)*sizeof(MatScalar),&apa);CHKERRQ(ierr);
639     ierr = PetscMemzero(apa,ptap->abnz_max*sizeof(MatScalar));CHKERRQ(ierr);
640   }
641 
642   ierr = PetscLogStagePush(stages[1]);CHKERRQ(ierr);
643   apsymi = ptap->abi; apsymj = ptap->abj;
644   for (i=0;i<am;i++) {
645     /* Form i-th sparse row of A*P */
646     apnzj = apsymi[i+1] - apsymi[i];
647     apj   = apsymj + apsymi[i];
648     /* diagonal portion of A */
649     anzi  = adi[i+1] - adi[i];
650     for (j=0;j<anzi;j++) {
651       prow = *adj_tmp;
652       adj_tmp++;
653       pnzj = pi_loc[prow+1] - pi_loc[prow];
654       pjj  = pj_loc + pi_loc[prow];
655       paj  = pa_loc + pi_loc[prow];
656       nextp = 0;
657       for (k=0; nextp<pnzj; k++) {
658         if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
659           apa[k] += (*ada_tmp)*paj[nextp++];
660         }
661       }
662       ada_tmp++;
663     }
664     /* off-diagonal portion of A */
665     anzi  = aoi[i+1] - aoi[i];
666     for (j=0;j<anzi;j++) {
667       col = a->garray[*aoj_tmp];
668       if (col < cstart){
669         prow = *aoj_tmp;
670       } else if (col >= cend){
671         prow = *aoj_tmp;
672       } else {
673         SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Off-diagonal portion of A has wrong column map");
674       }
675       aoj_tmp++;
676       pnzj = pi_oth[prow+1] - pi_oth[prow];
677       pjj  = pj_oth + pi_oth[prow];
678       paj  = pa_oth + pi_oth[prow];
679       nextp = 0;
680       for (k=0; nextp<pnzj; k++) {
681         if (apj[k] == pjj[nextp]) { /* col of AP == col of P */
682           apa[k] += (*aoa_tmp)*paj[nextp++];
683         }
684       }
685       flops += 2*pnzj;
686       aoa_tmp++;
687     }
688 
689     /* Compute P_loc[i,:]^T*AP[i,:] using outer product */
690     pnzi = pi_loc[i+1] - pi_loc[i];
691     for (j=0;j<pnzi;j++) {
692       nextap = 0;
693       crow   = *pJ++;
694       cjj    = cseqj + cseqi[crow];
695       caj    = cseqa + cseqi[crow];
696       /* Perform sparse axpy operation.  Note cjj includes apj. */
697       for (k=0;nextap<apnzj;k++) {
698         if (cjj[k]==apj[nextap]) caj[k] += (*pA)*apa[nextap++];
699       }
700       flops += 2*apnzj;
701       pA++;
702     }
703     /* zero the current row info for A*P */
704     ierr = PetscMemzero(apa,apnzj*sizeof(MatScalar));CHKERRQ(ierr);
705   }
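  /* At this point cseqa holds this process's numerical contribution to every row of
     C_seq (rows in global numbering, structure cseqi/cseqj from the symbolic phase).
     Rows owned by other processes are sent to their owners below and merged into C
     together with the contributions received from those processes. */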
706 
707   ierr = PetscFree(apa);CHKERRQ(ierr);
708   ierr = PetscLogStagePop();CHKERRQ(ierr);
709 
710   bi     = merge->bi;
711   bj     = merge->bj;
712   buf_ri = merge->buf_ri;
713   buf_rj = merge->buf_rj;
714 
715   ierr   = PetscMalloc(size*sizeof(MPI_Status),&status);CHKERRQ(ierr);
716   ierr   = PetscMapGetGlobalRange(merge->rowmap,&owners);CHKERRQ(ierr);
717 
718   /* send and recv matrix values */
719   /*-----------------------------*/
720   ierr = PetscLogStagePush(stages[2]);CHKERRQ(ierr);
721   len_s  = merge->len_s;
722   ierr = PetscObjectGetNewTag((PetscObject)merge->rowmap,&taga);CHKERRQ(ierr);
723   ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);
724 
725   ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&s_waits);CHKERRQ(ierr);
726   for (proc=0,k=0; proc<size; proc++){
727     if (!len_s[proc]) continue;
728     i = owners[proc];
729     ierr = MPI_Isend(cseqa+cseqi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
730     k++;
731   }
732 
733   ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);
734   ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);
735   ierr = PetscFree(status);CHKERRQ(ierr);
736 
737   ierr = PetscFree(s_waits);CHKERRQ(ierr);
738   ierr = PetscFree(r_waits);CHKERRQ(ierr);
739 
740   /* insert matrix values into the parallel matrix C */
741   /*----------------------------*/
742   ierr = PetscMalloc(cN*sizeof(MatScalar),&ba_i);CHKERRQ(ierr);
743   ierr = PetscMalloc((3*merge->nrecv+1)*sizeof(PetscInt**),&buf_ri_k);CHKERRQ(ierr);
744   nextrow = buf_ri_k + merge->nrecv;
745   nextcseqi  = nextrow + merge->nrecv;
746 
747   for (k=0; k<merge->nrecv; k++){
748     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
749     nrows = *(buf_ri_k[k]);
750     nextrow[k]  = buf_ri_k[k]+1;  /* next row number in k-th received i-structure */
751     nextcseqi[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure entry of the k-th received message */
752   }
753 
754   /* set values of ba */
755   for (i=0; i<C->m; i++) {
756     cseqrow = owners[rank] + i; /* global row index of C_seq */
757     bj_i = bj+bi[i];  /* col indices of the i-th row of C */
758     bnzi = bi[i+1] - bi[i];
759     ierr = PetscMemzero(ba_i,bnzi*sizeof(MatScalar));CHKERRQ(ierr);
760 
761     /* add local non-zero vals of this proc's C_seq into ba */
762     cseqnzi = cseqi[cseqrow+1] - cseqi[cseqrow];
763     cseqj_tmp = cseqj + cseqi[cseqrow];
764     cseqa_tmp = cseqa + cseqi[cseqrow];
765     nextcseqj = 0;
766     for (j=0; nextcseqj<cseqnzi; j++){
767       if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
768         ba_i[j] += cseqa_tmp[nextcseqj++];
769       }
770     }
771 
772     /* add received vals into ba */
773     for (k=0; k<merge->nrecv; k++){ /* k-th received message */
774       /* i-th row */
775       if (i == *nextrow[k]) {
776         cseqnzi   = *(nextcseqi[k]+1) - *nextcseqi[k];
777         cseqj_tmp = buf_rj[k] + *(nextcseqi[k]);
778         cseqa_tmp = abuf_r[k] + *(nextcseqi[k]);
779         nextcseqj = 0;
780         for (j=0; nextcseqj<cseqnzi; j++){
781           if (*(bj_i + j) == cseqj_tmp[nextcseqj]){ /* bcol == cseqcol */
782             ba_i[j] += cseqa_tmp[nextcseqj++];
783           }
784         }
785         nextrow[k]++; nextcseqi[k]++;
786       }
787     }
788     ierr = MatSetValues(C,1,&cseqrow,bnzi,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
789     flops += 2*bnzi;
790   }
791   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
792   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
793   ierr = PetscLogStagePop();CHKERRQ(ierr);
794 
795   ierr = PetscFree(cseqa);CHKERRQ(ierr);
796   ierr = PetscFree(abuf_r);CHKERRQ(ierr);
797   ierr = PetscFree(ba_i);CHKERRQ(ierr);
798   ierr = PetscFree(buf_ri_k);CHKERRQ(ierr);
799   ierr = PetscLogFlops(flops);CHKERRQ(ierr);
800 
801   PetscFunctionReturn(0);
802 }
803