xref: /petsc/src/ksp/pc/impls/redundant/redundant.c (revision 8ee2e534e188dd1b8ca1bcd65428d4df7139f0a3)
1 #define PETSCKSP_DLL
2 
3 /*
4   This file defines a "solve the problem redundantly on each subgroup of processor" preconditioner.
5 */
6 #include "private/pcimpl.h"     /*I "petscpc.h" I*/
7 #include "petscksp.h"
8 
9 typedef struct {
10   PC         pc;                   /* actual preconditioner used on each processor */
11   Vec        xsub,ysub;            /* vectors of a subcommunicator to hold parallel vectors of pc->comm */
12   Vec        xdup,ydup;            /* parallel vector that congregates xsub or ysub facilitating vector scattering */
13   Mat        pmats;                /* matrix and optional preconditioner matrix belong to a subcommunicator */
14   VecScatter scatterin,scatterout; /* scatter used to move all values to each processor group (subcommunicator) */
15   PetscTruth useparallelmat;
16   MPI_Comm   subcomm;              /* processors belong to a subcommunicator implement a PC in parallel */
17   MPI_Comm   dupcomm;              /* processors belong to pc->comm with their rank remapped in the way
18                                       that vector xdup/ydup has contiguous rank while appending xsub/ysub along their colors */
19   PetscInt   nsubcomm;             /* num of subcommunicators, which equals the num of redundant matrix systems */
20   PetscInt   color;                /* color of processors in a subcommunicator */
21 } PC_Redundant;
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "PCView_Redundant"
25 static PetscErrorCode PCView_Redundant(PC pc,PetscViewer viewer)
26 {
27   PC_Redundant   *red = (PC_Redundant*)pc->data;
28   PetscErrorCode ierr;
29   PetscMPIInt    rank;
30   PetscTruth     iascii,isstring;
31   PetscViewer    sviewer;
32 
33   PetscFunctionBegin;
34   ierr = MPI_Comm_rank(pc->comm,&rank);CHKERRQ(ierr);
35   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);CHKERRQ(ierr);
36   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_STRING,&isstring);CHKERRQ(ierr);
37   if (iascii) {
38     ierr = PetscViewerASCIIPrintf(viewer,"  Redundant solver preconditioner: Actual PC follows\n");CHKERRQ(ierr);
39     ierr = PetscViewerGetSingleton(viewer,&sviewer);CHKERRQ(ierr);
40     if (!rank) {
41       ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
42       ierr = PCView(red->pc,sviewer);CHKERRQ(ierr);
43       ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
44     }
45     ierr = PetscViewerRestoreSingleton(viewer,&sviewer);CHKERRQ(ierr);
46   } else if (isstring) {
47     ierr = PetscViewerStringSPrintf(viewer," Redundant solver preconditioner");CHKERRQ(ierr);
48     ierr = PetscViewerGetSingleton(viewer,&sviewer);CHKERRQ(ierr);
49     if (!rank) {
50       ierr = PCView(red->pc,sviewer);CHKERRQ(ierr);
51     }
52     ierr = PetscViewerRestoreSingleton(viewer,&sviewer);CHKERRQ(ierr);
53   } else {
54     SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for PC redundant",((PetscObject)viewer)->type_name);
55   }
56   PetscFunctionReturn(0);
57 }
58 
59 #include "src/mat/matimpl.h"        /*I "petscmat.h" I*/
60 #include "private/vecimpl.h"
61 #include "src/mat/impls/aij/mpi/mpiaij.h"   /*I "petscmat.h" I*/
62 #include "src/mat/impls/aij/seq/aij.h"      /*I "petscmat.h" I*/
63 
64 typedef struct { /* used by MatGetRedundantMatrix() for reusing matredundant */
65   PetscInt       nzlocal,nsends,nrecvs;
66   PetscInt       *send_rank,*sbuf_nz,*sbuf_j,**rbuf_j;
67   PetscScalar    *sbuf_a,**rbuf_a;
68   PetscErrorCode (*MatDestroy)(Mat);
69 } Mat_Redundant;
70 
71 #undef __FUNCT__
72 #define __FUNCT__ "PetscObjectContainerDestroy_MatRedundant"
73 PetscErrorCode PetscObjectContainerDestroy_MatRedundant(void *ptr)
74 {
75   PetscErrorCode       ierr;
76   Mat_Redundant        *redund=(Mat_Redundant*)ptr;
77   PetscInt             i;
78 
79   PetscFunctionBegin;
80   ierr = PetscFree(redund->send_rank);CHKERRQ(ierr);
81   ierr = PetscFree(redund->sbuf_j);CHKERRQ(ierr);
82   ierr = PetscFree(redund->sbuf_a);CHKERRQ(ierr);
83   for (i=0; i<redund->nrecvs; i++){
84     ierr = PetscFree(redund->rbuf_j[i]);CHKERRQ(ierr);
85     ierr = PetscFree(redund->rbuf_a[i]);CHKERRQ(ierr);
86   }
87   ierr = PetscFree3(redund->sbuf_nz,redund->rbuf_j,redund->rbuf_a);CHKERRQ(ierr);
88   ierr = PetscFree(redund);CHKERRQ(ierr);
89   PetscFunctionReturn(0);
90 }
91 
92 #undef __FUNCT__
93 #define __FUNCT__ "MatDestroy_MatRedundant"
94 PetscErrorCode MatDestroy_MatRedundant(Mat A)
95 {
96   PetscErrorCode       ierr;
97   PetscObjectContainer container;
98   Mat_Redundant        *redund=PETSC_NULL;
99 
100   PetscFunctionBegin;
101   ierr = PetscObjectQuery((PetscObject)A,"Mat_Redundant",(PetscObject *)&container);CHKERRQ(ierr);
102   if (container) {
103     ierr = PetscObjectContainerGetPointer(container,(void **)&redund);CHKERRQ(ierr);
104   } else {
105     SETERRQ(PETSC_ERR_PLIB,"Container does not exit");
106   }
107   A->ops->destroy = redund->MatDestroy;
108   ierr = PetscObjectCompose((PetscObject)A,"Mat_Redundant",0);CHKERRQ(ierr);
109   ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
110   ierr = PetscObjectContainerDestroy(container);CHKERRQ(ierr);
111   PetscFunctionReturn(0);
112 }
113 
114 #undef __FUNCT__
115 #define __FUNCT__ "MatGetRedundantMatrix"
116 PetscErrorCode MatGetRedundantMatrix_AIJ(Mat mat,PetscInt nsubcomm,MPI_Comm subcomm,PetscInt mlocal_sub,MatReuse reuse,Mat *matredundant)
117 {
118   PetscMPIInt    rank,size;
119   MPI_Comm       comm=mat->comm;
120   PetscErrorCode ierr;
121   PetscInt       nsends,nrecvs,i,prid=100,rownz_max;
122   PetscMPIInt    *send_rank,*recv_rank;
123   PetscInt       *rowrange=mat->rmap.range;
124   Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)mat->data;
125   Mat            A=aij->A,B=aij->B,C=*matredundant;
126   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data;
127   PetscScalar    *sbuf_a;
128   PetscInt       nzlocal=a->nz+b->nz;
129   PetscInt       j,cstart=mat->cmap.rstart,cend=mat->cmap.rend,row,nzA,nzB,ncols,*cworkA,*cworkB;
130   PetscInt       rstart=mat->rmap.rstart,rend=mat->rmap.rend,*bmap=aij->garray,M,N;
131   PetscInt       *cols,ctmp,lwrite,*rptr,l,*sbuf_j;
132   PetscScalar    *vals,*aworkA,*aworkB;
133   PetscMPIInt    tag1,tag2,tag3,imdex;
134   MPI_Request    *s_waits1,*s_waits2,*s_waits3,*r_waits1,*r_waits2,*r_waits3;
135   MPI_Status     recv_status,*send_status;
136   PetscInt       *sbuf_nz,*rbuf_nz,count;
137   PetscInt       **rbuf_j;
138   PetscScalar    **rbuf_a;
139   Mat_Redundant  *redund=PETSC_NULL;
140   PetscObjectContainer container;
141 
142   PetscFunctionBegin;
143   ierr = PetscOptionsGetInt(PETSC_NULL,"-prid",&prid,PETSC_NULL);CHKERRQ(ierr);
144   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
145   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
146 
147   if (reuse == MAT_REUSE_MATRIX) {
148     ierr = MatGetSize(C,&M,&N);CHKERRQ(ierr);
149     if (M != N || M != mat->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. Wrong global size");
150     ierr = MatGetLocalSize(C,&M,&N);CHKERRQ(ierr);
151     if (M != N || M != mlocal_sub) SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. Wrong local size");
152     ierr = PetscObjectQuery((PetscObject)C,"Mat_Redundant",(PetscObject *)&container);CHKERRQ(ierr);
153     if (container) {
154       ierr = PetscObjectContainerGetPointer(container,(void **)&redund);CHKERRQ(ierr);
155     } else {
156       SETERRQ(PETSC_ERR_PLIB,"Container does not exit");
157     }
158     if (nzlocal != redund->nzlocal) SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. Wrong nzlocal");
159 
160     nsends    = redund->nsends;
161     nrecvs    = redund->nrecvs;
162     send_rank = redund->send_rank; recv_rank = send_rank + size;
163     sbuf_nz   = redund->sbuf_nz;     rbuf_nz = sbuf_nz + nsends;
164     sbuf_j    = redund->sbuf_j;
165     sbuf_a    = redund->sbuf_a;
166     rbuf_j    = redund->rbuf_j;
167     rbuf_a    = redund->rbuf_a;
168   }
169 
170   if (reuse == MAT_INITIAL_MATRIX){
171     PetscMPIInt  subrank,subsize;
172     PetscInt     nleftover,np_subcomm;
173     /* get the destination processors' id send_rank, nsends and nrecvs */
174     ierr = MPI_Comm_rank(subcomm,&subrank);CHKERRQ(ierr);
175     ierr = MPI_Comm_size(subcomm,&subsize);CHKERRQ(ierr);
176     ierr = PetscMalloc((2*size+1)*sizeof(PetscMPIInt),&send_rank);
177     recv_rank = send_rank + size;
178     np_subcomm = size/nsubcomm;
179     nleftover  = size - nsubcomm*np_subcomm;
180     nsends = 0; nrecvs = 0;
181     for (i=0; i<size; i++){ /* i=rank*/
182       if (subrank == i/nsubcomm && rank != i){ /* my_subrank == other's subrank */
183         send_rank[nsends] = i; nsends++;
184         recv_rank[nrecvs++] = i;
185       }
186     }
187     if (rank >= size - nleftover){/* this proc is a leftover processor */
188       i = size-nleftover-1;
189       j = 0;
190       while (j < nsubcomm - nleftover){
191         send_rank[nsends++] = i;
192         i--; j++;
193       }
194     }
195 
196     if (nleftover && subsize == size/nsubcomm && subrank==subsize-1){ /* this proc recvs from leftover processors */
197       for (i=0; i<nleftover; i++){
198         recv_rank[nrecvs++] = size-nleftover+i;
199       }
200     }
201 
202     /* allocate sbuf_j, sbuf_a */
203     i = nzlocal + rowrange[rank+1] - rowrange[rank] + 2;
204     ierr = PetscMalloc(i*sizeof(PetscInt),&sbuf_j);CHKERRQ(ierr);
205     ierr = PetscMalloc((nzlocal+1)*sizeof(PetscScalar),&sbuf_a);CHKERRQ(ierr);
206   } /* endof if (reuse == MAT_INITIAL_MATRIX) */
207 
208   /* copy mat's local entries into the buffers */
209   if (reuse == MAT_INITIAL_MATRIX){
210     rownz_max = 0;
211     rptr = sbuf_j;
212     cols = sbuf_j + rend-rstart + 1;
213     vals = sbuf_a;
214     rptr[0] = 0;
215     for (i=0; i<rend-rstart; i++){
216       row = i + rstart;
217       nzA    = a->i[i+1] - a->i[i]; nzB = b->i[i+1] - b->i[i];
218       ncols  = nzA + nzB;
219       cworkA = a->j + a->i[i]; cworkB = b->j + b->i[i];
220       aworkA = a->a + a->i[i]; aworkB = b->a + b->i[i];
221       /* load the column indices for this row into cols */
222       lwrite = 0;
223       for (l=0; l<nzB; l++) {
224         if ((ctmp = bmap[cworkB[l]]) < cstart){
225           vals[lwrite]   = aworkB[l];
226           cols[lwrite++] = ctmp;
227         }
228       }
229       for (l=0; l<nzA; l++){
230         vals[lwrite]   = aworkA[l];
231         cols[lwrite++] = cstart + cworkA[l];
232       }
233       for (l=0; l<nzB; l++) {
234         if ((ctmp = bmap[cworkB[l]]) >= cend){
235           vals[lwrite]   = aworkB[l];
236           cols[lwrite++] = ctmp;
237         }
238       }
239       vals += ncols;
240       cols += ncols;
241       rptr[i+1] = rptr[i] + ncols;
242       if (rownz_max < ncols) rownz_max = ncols;
243     }
244     if (rptr[rend-rstart] != a->nz + b->nz) SETERRQ4(1, "rptr[%d] %d != %d + %d",rend-rstart,rptr[rend-rstart+1],a->nz,b->nz);
245   } else { /* only copy matrix values into sbuf_a */
246     rptr = sbuf_j;
247     vals = sbuf_a;
248     rptr[0] = 0;
249     for (i=0; i<rend-rstart; i++){
250       row = i + rstart;
251       nzA    = a->i[i+1] - a->i[i]; nzB = b->i[i+1] - b->i[i];
252       ncols  = nzA + nzB;
253       cworkA = a->j + a->i[i]; cworkB = b->j + b->i[i];
254       aworkA = a->a + a->i[i]; aworkB = b->a + b->i[i];
255       lwrite = 0;
256       for (l=0; l<nzB; l++) {
257         if ((ctmp = bmap[cworkB[l]]) < cstart) vals[lwrite++] = aworkB[l];
258       }
259       for (l=0; l<nzA; l++) vals[lwrite++] = aworkA[l];
260       for (l=0; l<nzB; l++) {
261         if ((ctmp = bmap[cworkB[l]]) >= cend) vals[lwrite++] = aworkB[l];
262       }
263       vals += ncols;
264       rptr[i+1] = rptr[i] + ncols;
265     }
266   } /* endof if (reuse == MAT_INITIAL_MATRIX) */
267 
268   /* send nzlocal to others, and recv other's nzlocal */
269   /*--------------------------------------------------*/
270   if (reuse == MAT_INITIAL_MATRIX){
271     ierr = PetscMalloc2(3*(nsends + nrecvs)+1,MPI_Request,&s_waits3,nsends+1,MPI_Status,&send_status);CHKERRQ(ierr);
272     s_waits2 = s_waits3 + nsends;
273     s_waits1 = s_waits2 + nsends;
274     r_waits1 = s_waits1 + nsends;
275     r_waits2 = r_waits1 + nrecvs;
276     r_waits3 = r_waits2 + nrecvs;
277   } else {
278     ierr = PetscMalloc2(nsends + nrecvs +1,MPI_Request,&s_waits3,nsends+1,MPI_Status,&send_status);CHKERRQ(ierr);
279     r_waits3 = s_waits3 + nsends;
280   }
281 
282   ierr = PetscObjectGetNewTag((PetscObject)mat,&tag3);CHKERRQ(ierr);
283   if (reuse == MAT_INITIAL_MATRIX){
284     /* get new tags to keep the communication clean */
285     ierr = PetscObjectGetNewTag((PetscObject)mat,&tag1);CHKERRQ(ierr);
286     ierr = PetscObjectGetNewTag((PetscObject)mat,&tag2);CHKERRQ(ierr);
287     ierr = PetscMalloc3(nsends+nrecvs+1,PetscInt,&sbuf_nz,nrecvs,PetscInt*,&rbuf_j,nrecvs,PetscScalar*,&rbuf_a);CHKERRQ(ierr);
288     rbuf_nz = sbuf_nz + nsends;
289 
290     /* post receives of other's nzlocal */
291     for (i=0; i<nrecvs; i++){
292       ierr = MPI_Irecv(rbuf_nz+i,1,MPIU_INT,MPI_ANY_SOURCE,tag1,comm,r_waits1+i);CHKERRQ(ierr);
293     }
294     /* send nzlocal to others */
295     for (i=0; i<nsends; i++){
296       sbuf_nz[i] = nzlocal;
297       ierr = MPI_Isend(sbuf_nz+i,1,MPIU_INT,send_rank[i],tag1,comm,s_waits1+i);CHKERRQ(ierr);
298     }
299     /* wait on receives of nzlocal; allocate space for rbuf_j, rbuf_a */
300     count = nrecvs;
301     while (count) {
302       ierr = MPI_Waitany(nrecvs,r_waits1,&imdex,&recv_status);CHKERRQ(ierr);
303       recv_rank[imdex] = recv_status.MPI_SOURCE;
304       /* allocate rbuf_a and rbuf_j; then post receives of rbuf_j */
305       ierr = PetscMalloc((rbuf_nz[imdex]+1)*sizeof(PetscScalar),&rbuf_a[imdex]);CHKERRQ(ierr);
306 
307       i = rowrange[recv_status.MPI_SOURCE+1] - rowrange[recv_status.MPI_SOURCE]; /* number of expected mat->i */
308       rbuf_nz[imdex] += i + 2;
309       ierr = PetscMalloc(rbuf_nz[imdex]*sizeof(PetscInt),&rbuf_j[imdex]);CHKERRQ(ierr);
310       ierr = MPI_Irecv(rbuf_j[imdex],rbuf_nz[imdex],MPIU_INT,recv_status.MPI_SOURCE,tag2,comm,r_waits2+imdex);CHKERRQ(ierr);
311       count--;
312     }
313     /* wait on sends of nzlocal */
314     if (nsends) {ierr = MPI_Waitall(nsends,s_waits1,send_status);CHKERRQ(ierr);}
315     /* send mat->i,j to others, and recv from other's */
316     /*------------------------------------------------*/
317     for (i=0; i<nsends; i++){
318       j = nzlocal + rowrange[rank+1] - rowrange[rank] + 1;
319       ierr = MPI_Isend(sbuf_j,j,MPIU_INT,send_rank[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
320     }
321     /* wait on receives of mat->i,j */
322     /*------------------------------*/
323     count = nrecvs;
324     while (count) {
325       ierr = MPI_Waitany(nrecvs,r_waits2,&imdex,&recv_status);CHKERRQ(ierr);
326       if (recv_rank[imdex] != recv_status.MPI_SOURCE) SETERRQ2(1, "recv_rank %d != MPI_SOURCE %d",recv_rank[imdex],recv_status.MPI_SOURCE);
327       count--;
328     }
329     /* wait on sends of mat->i,j */
330     /*---------------------------*/
331     if (nsends) {
332       ierr = MPI_Waitall(nsends,s_waits2,send_status);CHKERRQ(ierr);
333     }
334   } /* endof if (reuse == MAT_INITIAL_MATRIX) */
335 
336   /* post receives, send and receive mat->a */
337   /*----------------------------------------*/
338   for (imdex=0; imdex<nrecvs; imdex++) {
339     ierr = MPI_Irecv(rbuf_a[imdex],rbuf_nz[imdex],MPIU_SCALAR,recv_rank[imdex],tag3,comm,r_waits3+imdex);CHKERRQ(ierr);
340   }
341   for (i=0; i<nsends; i++){
342     ierr = MPI_Isend(sbuf_a,nzlocal,MPIU_SCALAR,send_rank[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
343   }
344   count = nrecvs;
345   while (count) {
346     ierr = MPI_Waitany(nrecvs,r_waits3,&imdex,&recv_status);CHKERRQ(ierr);
347     if (recv_rank[imdex] != recv_status.MPI_SOURCE) SETERRQ2(1, "recv_rank %d != MPI_SOURCE %d",recv_rank[imdex],recv_status.MPI_SOURCE);
348     count--;
349   }
350   if (nsends) {
351     ierr = MPI_Waitall(nsends,s_waits3,send_status);CHKERRQ(ierr);
352   }
353 
354   ierr = PetscFree2(s_waits3,send_status);CHKERRQ(ierr);
355 
356   /* create redundant matrix */
357   /*-------------------------*/
358   if (reuse == MAT_INITIAL_MATRIX){
359     /* compute rownz_max for preallocation */
360     for (imdex=0; imdex<nrecvs; imdex++){
361       j = rowrange[recv_rank[imdex]+1] - rowrange[recv_rank[imdex]];
362       rptr = rbuf_j[imdex];
363       for (i=0; i<j; i++){
364         ncols = rptr[i+1] - rptr[i];
365         if (rownz_max < ncols) rownz_max = ncols;
366       }
367     }
368 
369     ierr = MatCreate(subcomm,&C);CHKERRQ(ierr);
370     ierr = MatSetSizes(C,mlocal_sub,mlocal_sub,PETSC_DECIDE,PETSC_DECIDE);CHKERRQ(ierr);
371     ierr = MatSetFromOptions(C);CHKERRQ(ierr);
372     ierr = MatSeqAIJSetPreallocation(C,rownz_max,PETSC_NULL);CHKERRQ(ierr);
373     ierr = MatMPIAIJSetPreallocation(C,rownz_max,PETSC_NULL,rownz_max,PETSC_NULL);CHKERRQ(ierr);
374   } else {
375     C = *matredundant;
376   }
377 
378   /* insert local matrix entries */
379   rptr = sbuf_j;
380   cols = sbuf_j + rend-rstart + 1;
381   vals = sbuf_a;
382   for (i=0; i<rend-rstart; i++){
383     row   = i + rstart;
384     ncols = rptr[i+1] - rptr[i];
385     ierr = MatSetValues(C,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
386     vals += ncols;
387     cols += ncols;
388   }
389   /* insert received matrix entries */
390   for (imdex=0; imdex<nrecvs; imdex++){
391     rstart = rowrange[recv_rank[imdex]];
392     rend   = rowrange[recv_rank[imdex]+1];
393     rptr = rbuf_j[imdex];
394     cols = rbuf_j[imdex] + rend-rstart + 1;
395     vals = rbuf_a[imdex];
396     for (i=0; i<rend-rstart; i++){
397       row   = i + rstart;
398       ncols = rptr[i+1] - rptr[i];
399       ierr = MatSetValues(C,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
400       vals += ncols;
401       cols += ncols;
402     }
403   }
404   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
405   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
406   ierr = MatGetSize(C,&M,&N);CHKERRQ(ierr);
407   if (M != mat->rmap.N || N != mat->cmap.N) SETERRQ2(PETSC_ERR_ARG_INCOMP,"redundant mat size %d != input mat size %d",M,mat->rmap.N);
408   if (reuse == MAT_INITIAL_MATRIX){
409     PetscObjectContainer container;
410     *matredundant = C;
411     /* create a supporting struct and attach it to C for reuse */
412     ierr = PetscNew(Mat_Redundant,&redund);CHKERRQ(ierr);
413     ierr = PetscObjectContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
414     ierr = PetscObjectContainerSetPointer(container,redund);CHKERRQ(ierr);
415     ierr = PetscObjectCompose((PetscObject)C,"Mat_Redundant",(PetscObject)container);CHKERRQ(ierr);
416     ierr = PetscObjectContainerSetUserDestroy(container,PetscObjectContainerDestroy_MatRedundant);CHKERRQ(ierr);
417 
418     redund->nzlocal = nzlocal;
419     redund->nsends  = nsends;
420     redund->nrecvs  = nrecvs;
421     redund->send_rank = send_rank;
422     redund->sbuf_nz = sbuf_nz;
423     redund->sbuf_j  = sbuf_j;
424     redund->sbuf_a  = sbuf_a;
425     redund->rbuf_j  = rbuf_j;
426     redund->rbuf_a  = rbuf_a;
427 
428     redund->MatDestroy = C->ops->destroy;
429     C->ops->destroy    = MatDestroy_MatRedundant;
430   }
431   PetscFunctionReturn(0);
432 }
433 
434 #undef __FUNCT__
435 #define __FUNCT__ "PCSetUp_Redundant"
436 static PetscErrorCode PCSetUp_Redundant(PC pc)
437 {
438   PC_Redundant   *red  = (PC_Redundant*)pc->data;
439   PetscErrorCode ierr;
440   PetscInt       mstart,mend,mlocal,m;
441   PetscMPIInt    size;
442   MatReuse       reuse = MAT_INITIAL_MATRIX;
443   MatStructure   str   = DIFFERENT_NONZERO_PATTERN;
444   MPI_Comm       comm;
445   Vec            vec;
446 
447   PetscInt       mlocal_sub;
448   PetscMPIInt    subsize,subrank;
449   PetscInt       rstart_sub,rend_sub,mloc_sub;
450 
451   PetscFunctionBegin;
452   ierr = PetscObjectGetComm((PetscObject)pc->pmat,&comm);CHKERRQ(ierr);
453   ierr = MatGetVecs(pc->pmat,&vec,0);CHKERRQ(ierr);
454   ierr = PCSetFromOptions(red->pc);CHKERRQ(ierr);
455   ierr = VecGetSize(vec,&m);CHKERRQ(ierr);
456   if (!pc->setupcalled) {
457     /* create working vectors xsub/ysub and xdup/ydup */
458     ierr = VecGetLocalSize(vec,&mlocal);CHKERRQ(ierr);
459     ierr = VecGetOwnershipRange(vec,&mstart,&mend);CHKERRQ(ierr);
460 
461     /* get local size of xsub/ysub */
462     ierr = MPI_Comm_size(red->subcomm,&subsize);CHKERRQ(ierr);
463     ierr = MPI_Comm_rank(red->subcomm,&subrank);CHKERRQ(ierr);
464     rstart_sub = pc->pmat->rmap.range[red->nsubcomm*subrank]; /* rstart in xsub/ysub */
465     if (subrank+1 < subsize){
466       rend_sub = pc->pmat->rmap.range[red->nsubcomm*(subrank+1)];
467     } else {
468       rend_sub = m;
469     }
470     mloc_sub = rend_sub - rstart_sub;
471     ierr = VecCreateMPI(red->subcomm,mloc_sub,PETSC_DECIDE,&red->ysub);CHKERRQ(ierr);
472     /* create xsub with empty local arrays, because xdup's arrays will be placed into it */
473     ierr = VecCreateMPIWithArray(red->subcomm,mloc_sub,PETSC_DECIDE,PETSC_NULL,&red->xsub);CHKERRQ(ierr);
474 
475     /* create xdup and ydup. ydup has empty local arrays because ysub's arrays will be place into it.
476        Note: we use communicator dupcomm, not pc->comm! */
477     ierr = VecCreateMPI(red->dupcomm,mloc_sub,PETSC_DECIDE,&red->xdup);CHKERRQ(ierr);
478     ierr = VecCreateMPIWithArray(red->dupcomm,mloc_sub,PETSC_DECIDE,PETSC_NULL,&red->ydup);CHKERRQ(ierr);
479 
480     /* create vec scatters */
481     if (!red->scatterin){
482       IS       is1,is2;
483       PetscInt *idx1,*idx2,i,j,k;
484       ierr = PetscMalloc(2*red->nsubcomm*mlocal*sizeof(PetscInt),&idx1);CHKERRQ(ierr);
485       idx2 = idx1 + red->nsubcomm*mlocal;
486       j = 0;
487       for (k=0; k<red->nsubcomm; k++){
488         for (i=mstart; i<mend; i++){
489           idx1[j]   = i;
490           idx2[j++] = i + m*k;
491         }
492       }
493       ierr = ISCreateGeneral(comm,red->nsubcomm*mlocal,idx1,&is1);CHKERRQ(ierr);
494       ierr = ISCreateGeneral(comm,red->nsubcomm*mlocal,idx2,&is2);CHKERRQ(ierr);
495       ierr = VecScatterCreate(vec,is1,red->xdup,is2,&red->scatterin);CHKERRQ(ierr);
496       ierr = ISDestroy(is1);CHKERRQ(ierr);
497       ierr = ISDestroy(is2);CHKERRQ(ierr);
498 
499       ierr = ISCreateStride(comm,mlocal,mstart+ red->color*m,1,&is1);CHKERRQ(ierr);
500       ierr = ISCreateStride(comm,mlocal,mstart,1,&is2);CHKERRQ(ierr);
501       ierr = VecScatterCreate(red->xdup,is1,vec,is2,&red->scatterout);CHKERRQ(ierr);
502       ierr = ISDestroy(is1);CHKERRQ(ierr);
503       ierr = ISDestroy(is2);CHKERRQ(ierr);
504       ierr = PetscFree(idx1);CHKERRQ(ierr);
505     }
506   }
507   ierr = VecDestroy(vec);CHKERRQ(ierr);
508 
509   /* if pmatrix set by user is sequential then we do not need to gather the parallel matrix */
510   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
511   if (size == 1) {
512     red->useparallelmat = PETSC_FALSE;
513   }
514 
515   if (red->useparallelmat) {
516     if (pc->setupcalled == 1 && pc->flag == DIFFERENT_NONZERO_PATTERN) {
517       /* destroy old matrices */
518       if (red->pmats) {
519         ierr = MatDestroy(red->pmats);CHKERRQ(ierr);
520       }
521     } else if (pc->setupcalled == 1) {
522       reuse = MAT_REUSE_MATRIX;
523       str   = SAME_NONZERO_PATTERN;
524     }
525 
526     /* grab the parallel matrix and put it into processors of a subcomminicator */
527     /*--------------------------------------------------------------------------*/
528     ierr = VecGetLocalSize(red->ysub,&mlocal_sub);CHKERRQ(ierr);
529     ierr = MatGetRedundantMatrix_AIJ(pc->pmat,red->nsubcomm,red->subcomm,mlocal_sub,reuse,&red->pmats);CHKERRQ(ierr);
530     /* tell PC of the subcommunicator its operators */
531     ierr = PCSetOperators(red->pc,red->pmats,red->pmats,str);CHKERRQ(ierr);
532   } else {
533     ierr = PCSetOperators(red->pc,pc->mat,pc->pmat,pc->flag);CHKERRQ(ierr);
534   }
535   ierr = PCSetFromOptions(red->pc);CHKERRQ(ierr);
536   ierr = PCSetUp(red->pc);CHKERRQ(ierr);
537   PetscFunctionReturn(0);
538 }
539 
540 #undef __FUNCT__
541 #define __FUNCT__ "PCApply_Redundant"
542 static PetscErrorCode PCApply_Redundant(PC pc,Vec x,Vec y)
543 {
544   PC_Redundant   *red = (PC_Redundant*)pc->data;
545   PetscErrorCode ierr;
546   PetscScalar    *array;
547 
548   PetscFunctionBegin;
549   /* scatter x to xdup */
550   ierr = VecScatterBegin(x,red->xdup,INSERT_VALUES,SCATTER_FORWARD,red->scatterin);CHKERRQ(ierr);
551   ierr = VecScatterEnd(x,red->xdup,INSERT_VALUES,SCATTER_FORWARD,red->scatterin);CHKERRQ(ierr);
552 
553   /* place xdup's local array into xsub */
554   ierr = VecGetArray(red->xdup,&array);CHKERRQ(ierr);
555   ierr = VecPlaceArray(red->xsub,(const PetscScalar*)array);CHKERRQ(ierr);
556 
557   /* apply preconditioner on each processor */
558   ierr = PCApply(red->pc,red->xsub,red->ysub);CHKERRQ(ierr);
559   ierr = VecResetArray(red->xsub);CHKERRQ(ierr);
560   ierr = VecRestoreArray(red->xdup,&array);CHKERRQ(ierr);
561 
562   /* place ysub's local array into ydup */
563   ierr = VecGetArray(red->ysub,&array);CHKERRQ(ierr);
564   ierr = VecPlaceArray(red->ydup,(const PetscScalar*)array);CHKERRQ(ierr);
565 
566   /* scatter ydup to y */
567   ierr = VecScatterBegin(red->ydup,y,INSERT_VALUES,SCATTER_FORWARD,red->scatterout);CHKERRQ(ierr);
568   ierr = VecScatterEnd(red->ydup,y,INSERT_VALUES,SCATTER_FORWARD,red->scatterout);CHKERRQ(ierr);
569   ierr = VecResetArray(red->ydup);CHKERRQ(ierr);
570   ierr = VecRestoreArray(red->ysub,&array);CHKERRQ(ierr);
571   PetscFunctionReturn(0);
572 }
573 
574 #undef __FUNCT__
575 #define __FUNCT__ "PCDestroy_Redundant"
576 static PetscErrorCode PCDestroy_Redundant(PC pc)
577 {
578   PC_Redundant   *red = (PC_Redundant*)pc->data;
579   PetscErrorCode ierr;
580 
581   PetscFunctionBegin;
582   if (red->scatterin)  {ierr = VecScatterDestroy(red->scatterin);CHKERRQ(ierr);}
583   if (red->scatterout) {ierr = VecScatterDestroy(red->scatterout);CHKERRQ(ierr);}
584   if (red->ysub)       {ierr = VecDestroy(red->ysub);CHKERRQ(ierr);}
585   if (red->xsub)       {ierr = VecDestroy(red->xsub);CHKERRQ(ierr);}
586   if (red->xdup)       {ierr = VecDestroy(red->xdup);CHKERRQ(ierr);}
587   if (red->ydup)       {ierr = VecDestroy(red->ydup);CHKERRQ(ierr);}
588   if (red->pmats) {
589     ierr = MatDestroy(red->pmats);CHKERRQ(ierr);
590   }
591 
592 
593   ierr = PCDestroy(red->pc);CHKERRQ(ierr);
594   ierr = PetscFree(red);CHKERRQ(ierr);
595   PetscFunctionReturn(0);
596 }
597 
598 #undef __FUNCT__
599 #define __FUNCT__ "PCSetFromOptions_Redundant"
600 static PetscErrorCode PCSetFromOptions_Redundant(PC pc)
601 {
602   PetscFunctionBegin;
603   PetscFunctionReturn(0);
604 }
605 
606 EXTERN_C_BEGIN
607 #undef __FUNCT__
608 #define __FUNCT__ "PCRedundantSetScatter_Redundant"
609 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantSetScatter_Redundant(PC pc,VecScatter in,VecScatter out)
610 {
611   PC_Redundant   *red = (PC_Redundant*)pc->data;
612   PetscErrorCode ierr;
613 
614   PetscFunctionBegin;
615   red->scatterin  = in;
616   red->scatterout = out;
617   ierr = PetscObjectReference((PetscObject)in);CHKERRQ(ierr);
618   ierr = PetscObjectReference((PetscObject)out);CHKERRQ(ierr);
619   PetscFunctionReturn(0);
620 }
621 EXTERN_C_END
622 
623 #undef __FUNCT__
624 #define __FUNCT__ "PCRedundantSetScatter"
625 /*@
626    PCRedundantSetScatter - Sets the scatter used to copy values into the
627      redundant local solve and the scatter to move them back into the global
628      vector.
629 
630    Collective on PC
631 
632    Input Parameters:
633 +  pc - the preconditioner context
634 .  in - the scatter to move the values in
635 -  out - the scatter to move them out
636 
637    Level: advanced
638 
639 .keywords: PC, redundant solve
640 @*/
641 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantSetScatter(PC pc,VecScatter in,VecScatter out)
642 {
643   PetscErrorCode ierr,(*f)(PC,VecScatter,VecScatter);
644 
645   PetscFunctionBegin;
646   PetscValidHeaderSpecific(pc,PC_COOKIE,1);
647   PetscValidHeaderSpecific(in,VEC_SCATTER_COOKIE,2);
648   PetscValidHeaderSpecific(out,VEC_SCATTER_COOKIE,3);
649   ierr = PetscObjectQueryFunction((PetscObject)pc,"PCRedundantSetScatter_C",(void (**)(void))&f);CHKERRQ(ierr);
650   if (f) {
651     ierr = (*f)(pc,in,out);CHKERRQ(ierr);
652   }
653   PetscFunctionReturn(0);
654 }
655 
656 EXTERN_C_BEGIN
657 #undef __FUNCT__
658 #define __FUNCT__ "PCRedundantGetPC_Redundant"
659 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetPC_Redundant(PC pc,PC *innerpc)
660 {
661   PC_Redundant *red = (PC_Redundant*)pc->data;
662 
663   PetscFunctionBegin;
664   *innerpc = red->pc;
665   PetscFunctionReturn(0);
666 }
667 EXTERN_C_END
668 
669 #undef __FUNCT__
670 #define __FUNCT__ "PCRedundantGetPC"
671 /*@
672    PCRedundantGetPC - Gets the sequential PC created by the redundant PC.
673 
674    Not Collective
675 
676    Input Parameter:
677 .  pc - the preconditioner context
678 
679    Output Parameter:
680 .  innerpc - the sequential PC
681 
682    Level: advanced
683 
684 .keywords: PC, redundant solve
685 @*/
686 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetPC(PC pc,PC *innerpc)
687 {
688   PetscErrorCode ierr,(*f)(PC,PC*);
689 
690   PetscFunctionBegin;
691   PetscValidHeaderSpecific(pc,PC_COOKIE,1);
692   PetscValidPointer(innerpc,2);
693   ierr = PetscObjectQueryFunction((PetscObject)pc,"PCRedundantGetPC_C",(void (**)(void))&f);CHKERRQ(ierr);
694   if (f) {
695     ierr = (*f)(pc,innerpc);CHKERRQ(ierr);
696   }
697   PetscFunctionReturn(0);
698 }
699 
700 EXTERN_C_BEGIN
701 #undef __FUNCT__
702 #define __FUNCT__ "PCRedundantGetOperators_Redundant"
703 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetOperators_Redundant(PC pc,Mat *mat,Mat *pmat)
704 {
705   PC_Redundant *red = (PC_Redundant*)pc->data;
706 
707   PetscFunctionBegin;
708   if (mat)  *mat  = red->pmats;
709   if (pmat) *pmat = red->pmats;
710   PetscFunctionReturn(0);
711 }
712 EXTERN_C_END
713 
714 #undef __FUNCT__
715 #define __FUNCT__ "PCRedundantGetOperators"
716 /*@
717    PCRedundantGetOperators - gets the sequential matrix and preconditioner matrix
718 
719    Not Collective
720 
721    Input Parameter:
722 .  pc - the preconditioner context
723 
724    Output Parameters:
725 +  mat - the matrix
726 -  pmat - the (possibly different) preconditioner matrix
727 
728    Level: advanced
729 
730 .keywords: PC, redundant solve
731 @*/
732 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetOperators(PC pc,Mat *mat,Mat *pmat)
733 {
734   PetscErrorCode ierr,(*f)(PC,Mat*,Mat*);
735 
736   PetscFunctionBegin;
737   PetscValidHeaderSpecific(pc,PC_COOKIE,1);
738   if (mat)  PetscValidPointer(mat,2);
739   if (pmat) PetscValidPointer(pmat,3);
740   ierr = PetscObjectQueryFunction((PetscObject)pc,"PCRedundantGetOperators_C",(void (**)(void))&f);CHKERRQ(ierr);
741   if (f) {
742     ierr = (*f)(pc,mat,pmat);CHKERRQ(ierr);
743   }
744   PetscFunctionReturn(0);
745 }
746 
747 /* -------------------------------------------------------------------------------------*/
748 /*MC
749      PCREDUNDANT - Runs a preconditioner for the entire problem on each processor
750 
751 
752      Options for the redundant preconditioners can be set with -redundant_pc_xxx
753 
754    Level: intermediate
755 
756 
757 .seealso:  PCCreate(), PCSetType(), PCType (for list of available types), PCRedundantSetScatter(),
758            PCRedundantGetPC(), PCRedundantGetOperators()
759 
760 M*/
761 
762 EXTERN_C_BEGIN
763 #undef __FUNCT__
764 #define __FUNCT__ "PCCreate_Redundant"
765 PetscErrorCode PETSCKSP_DLLEXPORT PCCreate_Redundant(PC pc)
766 {
767   PetscErrorCode ierr;
768   PC_Redundant   *red;
769   const char     *prefix;
770   PetscInt       nsubcomm,np_subcomm,nleftover,i,j,color;
771   PetscMPIInt    rank,size,subrank,*subsize,duprank;
772   MPI_Comm       subcomm=0,dupcomm=0;
773 
774   PetscFunctionBegin;
775   ierr = PetscNew(PC_Redundant,&red);CHKERRQ(ierr);
776   ierr = PetscLogObjectMemory(pc,sizeof(PC_Redundant));CHKERRQ(ierr);
777   red->useparallelmat   = PETSC_TRUE;
778 
779   ierr = MPI_Comm_rank(pc->comm,&rank);CHKERRQ(ierr);
780   ierr = MPI_Comm_size(pc->comm,&size);CHKERRQ(ierr);
781   nsubcomm = size;
782   ierr = PetscOptionsGetInt(PETSC_NULL,"-nsubcomm",&nsubcomm,PETSC_NULL);CHKERRQ(ierr);
783   if (nsubcomm > size) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE, "Num of subcommunicators %D cannot be larger than MPI_Comm_size %D",nsubcomm,size);
784 
785   /*--------------------------------------------------------------------------------------------------
786     To avoid data scattering from subcomm back to original comm, we create subcomm by iteratively taking a
787     processe into a subcomm.
788     An example: size=4, nsubcomm=3
789      pc->comm:
790       rank:     [0]  [1]  [2]  [3]
791       color:     0    1    2    0
792 
793      subcomm:
794       subrank:  [0]  [0]  [0]  [1]
795 
796      dupcomm:
797       duprank:  [0]  [2]  [3]  [1]
798 
799      Here, subcomm[color = 0] has subsize=2, owns process [0] and [3]
800            subcomm[color = 1] has subsize=1, owns process [1]
801            subcomm[color = 2] has subsize=1, owns process [2]
802           dupcomm has same number of processes as pc->comm, and its duprank maps
803           processes in subcomm contiguously into a 1d array:
804             duprank: [0] [1]      [2]         [3]
805             rank:    [0] [3]      [1]         [2]
806                     subcomm[0] subcomm[1]  subcomm[2]
807    ----------------------------------------------------------------------------------------*/
808 
809   /* get size of each subcommunicators */
810   ierr = PetscMalloc((1+nsubcomm)*sizeof(PetscMPIInt),&subsize);CHKERRQ(ierr);
811   np_subcomm = size/nsubcomm;
812   nleftover  = size - nsubcomm*np_subcomm;
813   for (i=0; i<nsubcomm; i++){
814     subsize[i] = np_subcomm;
815     if (i<nleftover) subsize[i]++;
816   }
817 
818   /* find color for this proc */
819   color   = rank%nsubcomm;
820   subrank = rank/nsubcomm;
821 
822   ierr = MPI_Comm_split(pc->comm,color,subrank,&subcomm);CHKERRQ(ierr);
823   red->subcomm  = subcomm;
824   red->color    = color;
825   red->nsubcomm = nsubcomm;
826 
827   j = 0; duprank = 0;
828   for (i=0; i<nsubcomm; i++){
829     if (j == color){
830       duprank += subrank;
831       break;
832     }
833     duprank += subsize[i]; j++;
834   }
835 
836   /* create dupcomm with same size as comm, but its rank, duprank, maps subcomm's contiguously into dupcomm */
837   ierr = MPI_Comm_split(pc->comm,0,duprank,&dupcomm);CHKERRQ(ierr);
838   red->dupcomm = dupcomm;
839   ierr = PetscFree(subsize);CHKERRQ(ierr);
840   /* if (rank == 0) printf("[%d] subrank %d, duprank: %d\n",rank,subrank,duprank); */
841 
842   /* create the sequential PC that each processor has copy of */
843   ierr = PCCreate(subcomm,&red->pc);CHKERRQ(ierr);
844   ierr = PCSetType(red->pc,PCLU);CHKERRQ(ierr);
845   ierr = PCGetOptionsPrefix(pc,&prefix);CHKERRQ(ierr);
846   ierr = PCSetOptionsPrefix(red->pc,prefix);CHKERRQ(ierr);
847   ierr = PCAppendOptionsPrefix(red->pc,"redundant_");CHKERRQ(ierr);
848 
849   pc->ops->apply             = PCApply_Redundant;
850   pc->ops->applytranspose    = 0;
851   pc->ops->setup             = PCSetUp_Redundant;
852   pc->ops->destroy           = PCDestroy_Redundant;
853   pc->ops->setfromoptions    = PCSetFromOptions_Redundant;
854   pc->ops->setuponblocks     = 0;
855   pc->ops->view              = PCView_Redundant;
856   pc->ops->applyrichardson   = 0;
857 
858   pc->data     = (void*)red;
859 
860   ierr = PetscObjectComposeFunctionDynamic((PetscObject)pc,"PCRedundantSetScatter_C","PCRedundantSetScatter_Redundant",
861                     PCRedundantSetScatter_Redundant);CHKERRQ(ierr);
862   ierr = PetscObjectComposeFunctionDynamic((PetscObject)pc,"PCRedundantGetPC_C","PCRedundantGetPC_Redundant",
863                     PCRedundantGetPC_Redundant);CHKERRQ(ierr);
864   ierr = PetscObjectComposeFunctionDynamic((PetscObject)pc,"PCRedundantGetOperators_C","PCRedundantGetOperators_Redundant",
865                     PCRedundantGetOperators_Redundant);CHKERRQ(ierr);
866 
867   PetscFunctionReturn(0);
868 }
869 EXTERN_C_END
870