xref: /petsc/src/ksp/pc/impls/redundant/redundant.c (revision a98ce0f41cecbcf583ea61eee7d4bfcfd656d38d)
1 #define PETSCKSP_DLL
2 
3 /*
4   This file defines a "solve the problem redundantly on each subgroup of processes" preconditioner.
5 */
6 #include "private/pcimpl.h"     /*I "petscpc.h" I*/
7 #include "petscksp.h"
8 
9 typedef struct {
10   PC         pc;                   /* actual preconditioner used on each subcommunicator */
11   Vec        xsub,ysub;            /* vectors of a subcommunicator to hold parallel vectors of pc->comm */
12   Vec        xdup,ydup;            /* parallel vectors that aggregate xsub or ysub to facilitate vector scattering */
13   Mat        pmats;                /* matrix and optional preconditioner matrix belong to a subcommunicator */
14   VecScatter scatterin,scatterout; /* scatter used to move all values to each processor group (subcommunicator) */
15   PetscTruth useparallelmat;
16   MPI_Comm   subcomm;              /* processes belonging to a subcommunicator implement a PC in parallel */
17   MPI_Comm   dupcomm;              /* processes of pc->comm with their ranks remapped so that the vectors
18                                       xdup/ydup hold the xsub/ysub of all colors appended contiguously */
19   PetscInt   nsubcomm;             /* num of subcommunicators, which equals the num of redundant matrix systems */
20   PetscInt   color;                /* color of processors in a subcommunicator */
21 } PC_Redundant;
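
/*
   A sketch of the apply-phase data flow that these fields support (this mirrors
   PCApply_Redundant() below; the names are the struct members above):

     x (pc->comm) --scatterin--> xdup (dupcomm) --VecPlaceArray--> xsub (subcomm)
     xsub --PCApply(red->pc)--> ysub
     ysub --VecPlaceArray--> ydup (dupcomm) --scatterout--> y (pc->comm)

   Because duprank orders the processes of each subcommunicator contiguously, xdup/ydup
   can share their local arrays with xsub/ysub without extra copies.
*/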
22 
23 #undef __FUNCT__
24 #define __FUNCT__ "PCView_Redundant"
25 static PetscErrorCode PCView_Redundant(PC pc,PetscViewer viewer)
26 {
27   PC_Redundant   *red = (PC_Redundant*)pc->data;
28   PetscErrorCode ierr;
29   PetscMPIInt    rank;
30   PetscTruth     iascii,isstring;
31   PetscViewer    sviewer,subviewer;
32   PetscInt       color = red->color;
33 
34   PetscFunctionBegin;
35   ierr = MPI_Comm_rank(pc->comm,&rank);CHKERRQ(ierr);
36   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);CHKERRQ(ierr);
37   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_STRING,&isstring);CHKERRQ(ierr);
38   if (iascii) {
39     ierr = PetscViewerASCIIPrintf(viewer,"  Redundant solver preconditioner: First PC (color=0) follows\n");CHKERRQ(ierr);
40     ierr = PetscViewerGetSubcomm(viewer,red->pc->comm,&subviewer);CHKERRQ(ierr);
41     if (!color) { /* only view first redundant pc */
42       ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
43       ierr = PCView(red->pc,subviewer);CHKERRQ(ierr);
44       ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
45     }
46     ierr = PetscViewerRestoreSubcomm(viewer,red->pc->comm,&subviewer);CHKERRQ(ierr);
47   } else if (isstring) { /* not tested yet */
48     ierr = PetscViewerStringSPrintf(viewer," Redundant solver preconditioner");CHKERRQ(ierr);
49     ierr = PetscViewerGetSingleton(viewer,&sviewer);CHKERRQ(ierr);
50     if (!rank) {
51       ierr = PCView(red->pc,sviewer);CHKERRQ(ierr);
52     }
53     ierr = PetscViewerRestoreSingleton(viewer,&sviewer);CHKERRQ(ierr);
54   } else {
55     SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for PC redundant",((PetscObject)viewer)->type_name);
56   }
57   PetscFunctionReturn(0);
58 }
59 
60 #include "include/private/matimpl.h"        /*I "petscmat.h" I*/
61 #include "private/vecimpl.h"
62 #include "src/mat/impls/aij/mpi/mpiaij.h"   /*I "petscmat.h" I*/
63 #include "src/mat/impls/aij/seq/aij.h"      /*I "petscmat.h" I*/
64 
65 typedef struct { /* used by MatGetRedundantMatrix() for reusing matredundant */
66   PetscInt       nzlocal,nsends,nrecvs;
67   PetscInt       *send_rank,*sbuf_nz,*sbuf_j,**rbuf_j;
68   PetscScalar    *sbuf_a,**rbuf_a;
69   PetscErrorCode (*MatDestroy)(Mat);
70 } Mat_Redundant;
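
/*
   Note on reuse (a summary of MatGetRedundantMatrix_AIJ() below, not an API contract):
   on the MAT_INITIAL_MATRIX pass the row pointers and column indices (sbuf_j/rbuf_j)
   are exchanged once and cached here together with the communication pattern
   (send_rank, nsends, nrecvs); on a MAT_REUSE_MATRIX pass only the numerical values
   (sbuf_a/rbuf_a) are re-sent.
*/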
71 
72 #undef __FUNCT__
73 #define __FUNCT__ "PetscContainerDestroy_MatRedundant"
74 PetscErrorCode PetscContainerDestroy_MatRedundant(void *ptr)
75 {
76   PetscErrorCode       ierr;
77   Mat_Redundant        *redund=(Mat_Redundant*)ptr;
78   PetscInt             i;
79 
80   PetscFunctionBegin;
81   ierr = PetscFree(redund->send_rank);CHKERRQ(ierr);
82   ierr = PetscFree(redund->sbuf_j);CHKERRQ(ierr);
83   ierr = PetscFree(redund->sbuf_a);CHKERRQ(ierr);
84   for (i=0; i<redund->nrecvs; i++){
85     ierr = PetscFree(redund->rbuf_j[i]);CHKERRQ(ierr);
86     ierr = PetscFree(redund->rbuf_a[i]);CHKERRQ(ierr);
87   }
88   ierr = PetscFree3(redund->sbuf_nz,redund->rbuf_j,redund->rbuf_a);CHKERRQ(ierr);
89   ierr = PetscFree(redund);CHKERRQ(ierr);
90   PetscFunctionReturn(0);
91 }
92 
93 #undef __FUNCT__
94 #define __FUNCT__ "MatDestroy_MatRedundant"
95 PetscErrorCode MatDestroy_MatRedundant(Mat A)
96 {
97   PetscErrorCode       ierr;
98   PetscContainer container;
99   Mat_Redundant        *redund=PETSC_NULL;
100 
101   PetscFunctionBegin;
102   ierr = PetscObjectQuery((PetscObject)A,"Mat_Redundant",(PetscObject *)&container);CHKERRQ(ierr);
103   if (container) {
104     ierr = PetscContainerGetPointer(container,(void **)&redund);CHKERRQ(ierr);
105   } else {
106     SETERRQ(PETSC_ERR_PLIB,"Container does not exist");
107   }
108   A->ops->destroy = redund->MatDestroy;
109   ierr = PetscObjectCompose((PetscObject)A,"Mat_Redundant",0);CHKERRQ(ierr);
110   ierr = (*A->ops->destroy)(A);CHKERRQ(ierr);
111   ierr = PetscContainerDestroy(container);CHKERRQ(ierr);
112   PetscFunctionReturn(0);
113 }
114 
115 #undef __FUNCT__
116 #define __FUNCT__ "MatGetRedundantMatrix_AIJ"
117 PetscErrorCode MatGetRedundantMatrix_AIJ(Mat mat,PetscInt nsubcomm,MPI_Comm subcomm,PetscInt mlocal_sub,MatReuse reuse,Mat *matredundant)
118 {
119   PetscMPIInt    rank,size;
120   MPI_Comm       comm=mat->comm;
121   PetscErrorCode ierr;
122   PetscInt       nsends=0,nrecvs=0,i,rownz_max=0;
123   PetscMPIInt    *send_rank=PETSC_NULL,*recv_rank=PETSC_NULL;
124   PetscInt       *rowrange=mat->rmap.range;
125   Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)mat->data;
126   Mat            A=aij->A,B=aij->B,C=*matredundant;
127   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data;
128   PetscScalar    *sbuf_a;
129   PetscInt       nzlocal=a->nz+b->nz;
130   PetscInt       j,cstart=mat->cmap.rstart,cend=mat->cmap.rend,row,nzA,nzB,ncols,*cworkA,*cworkB;
131   PetscInt       rstart=mat->rmap.rstart,rend=mat->rmap.rend,*bmap=aij->garray,M,N;
132   PetscInt       *cols,ctmp,lwrite,*rptr,l,*sbuf_j;
133   PetscScalar    *vals,*aworkA,*aworkB;
134   PetscMPIInt    tag1,tag2,tag3,imdex;
135   MPI_Request    *s_waits1=PETSC_NULL,*s_waits2=PETSC_NULL,*s_waits3=PETSC_NULL,
136                  *r_waits1=PETSC_NULL,*r_waits2=PETSC_NULL,*r_waits3=PETSC_NULL;
137   MPI_Status     recv_status,*send_status;
138   PetscInt       *sbuf_nz=PETSC_NULL,*rbuf_nz=PETSC_NULL,count;
139   PetscInt       **rbuf_j=PETSC_NULL;
140   PetscScalar    **rbuf_a=PETSC_NULL;
141   Mat_Redundant  *redund=PETSC_NULL;
142   PetscContainer container;
143 
144   PetscFunctionBegin;
145   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
146   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
147 
148   if (reuse == MAT_REUSE_MATRIX) {
149     ierr = MatGetSize(C,&M,&N);CHKERRQ(ierr);
150     if (M != N || M != mat->rmap.N) SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. Wrong global size");
151     ierr = MatGetLocalSize(C,&M,&N);CHKERRQ(ierr);
152     if (M != N || M != mlocal_sub) SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. Wrong local size");
153     ierr = PetscObjectQuery((PetscObject)C,"Mat_Redundant",(PetscObject *)&container);CHKERRQ(ierr);
154     if (container) {
155       ierr = PetscContainerGetPointer(container,(void **)&redund);CHKERRQ(ierr);
156     } else {
157       SETERRQ(PETSC_ERR_PLIB,"Container does not exist");
158     }
159     if (nzlocal != redund->nzlocal) SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. Wrong nzlocal");
160 
161     nsends    = redund->nsends;
162     nrecvs    = redund->nrecvs;
163     send_rank = redund->send_rank; recv_rank = send_rank + size;
164     sbuf_nz   = redund->sbuf_nz;     rbuf_nz = sbuf_nz + nsends;
165     sbuf_j    = redund->sbuf_j;
166     sbuf_a    = redund->sbuf_a;
167     rbuf_j    = redund->rbuf_j;
168     rbuf_a    = redund->rbuf_a;
169   }
170 
171   if (reuse == MAT_INITIAL_MATRIX){
172     PetscMPIInt  subrank,subsize;
173     PetscInt     nleftover,np_subcomm;
174     /* get the destination processes' ids (send_rank), nsends and nrecvs */
175     ierr = MPI_Comm_rank(subcomm,&subrank);CHKERRQ(ierr);
176     ierr = MPI_Comm_size(subcomm,&subsize);CHKERRQ(ierr);
177     ierr = PetscMalloc((2*size+1)*sizeof(PetscMPIInt),&send_rank);CHKERRQ(ierr);
178     recv_rank = send_rank + size;
179     np_subcomm = size/nsubcomm;
180     nleftover  = size - nsubcomm*np_subcomm;
181     nsends = 0; nrecvs = 0;
182     for (i=0; i<size; i++){ /* i = rank of another process in comm */
183       if (subrank == i/nsubcomm && rank != i){ /* my_subrank == other's subrank */
184         send_rank[nsends] = i; nsends++;
185         recv_rank[nrecvs++] = i;
186       }
187     }
188     if (rank >= size - nleftover){/* this proc is a leftover processor */
189       i = size-nleftover-1;
190       j = 0;
191       while (j < nsubcomm - nleftover){
192         send_rank[nsends++] = i;
193         i--; j++;
194       }
195     }
196 
197     if (nleftover && subsize == size/nsubcomm && subrank==subsize-1){ /* this proc recvs from leftover processors */
198       for (i=0; i<nleftover; i++){
199         recv_rank[nrecvs++] = size-nleftover+i;
200       }
201     }
202 
203     /* allocate sbuf_j, sbuf_a */
204     i = nzlocal + rowrange[rank+1] - rowrange[rank] + 2;
205     ierr = PetscMalloc(i*sizeof(PetscInt),&sbuf_j);CHKERRQ(ierr);
206     ierr = PetscMalloc((nzlocal+1)*sizeof(PetscScalar),&sbuf_a);CHKERRQ(ierr);
207   } /* end of if (reuse == MAT_INITIAL_MATRIX) */
208 
209   /* copy mat's local entries into the buffers */
210   if (reuse == MAT_INITIAL_MATRIX){
211     rownz_max = 0;
212     rptr = sbuf_j;
213     cols = sbuf_j + rend-rstart + 1;
214     vals = sbuf_a;
215     rptr[0] = 0;
216     for (i=0; i<rend-rstart; i++){
217       row = i + rstart;
218       nzA    = a->i[i+1] - a->i[i]; nzB = b->i[i+1] - b->i[i];
219       ncols  = nzA + nzB;
220       cworkA = a->j + a->i[i]; cworkB = b->j + b->i[i];
221       aworkA = a->a + a->i[i]; aworkB = b->a + b->i[i];
222       /* load the column indices for this row into cols */
223       lwrite = 0;
224       for (l=0; l<nzB; l++) {
225         if ((ctmp = bmap[cworkB[l]]) < cstart){
226           vals[lwrite]   = aworkB[l];
227           cols[lwrite++] = ctmp;
228         }
229       }
230       for (l=0; l<nzA; l++){
231         vals[lwrite]   = aworkA[l];
232         cols[lwrite++] = cstart + cworkA[l];
233       }
234       for (l=0; l<nzB; l++) {
235         if ((ctmp = bmap[cworkB[l]]) >= cend){
236           vals[lwrite]   = aworkB[l];
237           cols[lwrite++] = ctmp;
238         }
239       }
240       vals += ncols;
241       cols += ncols;
242       rptr[i+1] = rptr[i] + ncols;
243       if (rownz_max < ncols) rownz_max = ncols;
244     }
245     if (rptr[rend-rstart] != a->nz + b->nz) SETERRQ4(1, "rptr[%d] %d != %d + %d",rend-rstart,rptr[rend-rstart],a->nz,b->nz);
246   } else { /* only copy matrix values into sbuf_a */
247     rptr = sbuf_j;
248     vals = sbuf_a;
249     rptr[0] = 0;
250     for (i=0; i<rend-rstart; i++){
251       row = i + rstart;
252       nzA    = a->i[i+1] - a->i[i]; nzB = b->i[i+1] - b->i[i];
253       ncols  = nzA + nzB;
254       cworkA = a->j + a->i[i]; cworkB = b->j + b->i[i];
255       aworkA = a->a + a->i[i]; aworkB = b->a + b->i[i];
256       lwrite = 0;
257       for (l=0; l<nzB; l++) {
258         if ((ctmp = bmap[cworkB[l]]) < cstart) vals[lwrite++] = aworkB[l];
259       }
260       for (l=0; l<nzA; l++) vals[lwrite++] = aworkA[l];
261       for (l=0; l<nzB; l++) {
262         if ((ctmp = bmap[cworkB[l]]) >= cend) vals[lwrite++] = aworkB[l];
263       }
264       vals += ncols;
265       rptr[i+1] = rptr[i] + ncols;
266     }
267   } /* end of if/else (reuse == MAT_INITIAL_MATRIX) */
268 
269   /* send nzlocal to others, and recv other's nzlocal */
270   /*--------------------------------------------------*/
271   if (reuse == MAT_INITIAL_MATRIX){
272     ierr = PetscMalloc2(3*(nsends + nrecvs)+1,MPI_Request,&s_waits3,nsends+1,MPI_Status,&send_status);CHKERRQ(ierr);
273     s_waits2 = s_waits3 + nsends;
274     s_waits1 = s_waits2 + nsends;
275     r_waits1 = s_waits1 + nsends;
276     r_waits2 = r_waits1 + nrecvs;
277     r_waits3 = r_waits2 + nrecvs;
278   } else {
279     ierr = PetscMalloc2(nsends + nrecvs +1,MPI_Request,&s_waits3,nsends+1,MPI_Status,&send_status);CHKERRQ(ierr);
280     r_waits3 = s_waits3 + nsends;
281   }
282 
283   ierr = PetscObjectGetNewTag((PetscObject)mat,&tag3);CHKERRQ(ierr);
284   if (reuse == MAT_INITIAL_MATRIX){
285     /* get new tags to keep the communication clean */
286     ierr = PetscObjectGetNewTag((PetscObject)mat,&tag1);CHKERRQ(ierr);
287     ierr = PetscObjectGetNewTag((PetscObject)mat,&tag2);CHKERRQ(ierr);
288     ierr = PetscMalloc3(nsends+nrecvs+1,PetscInt,&sbuf_nz,nrecvs,PetscInt*,&rbuf_j,nrecvs,PetscScalar*,&rbuf_a);CHKERRQ(ierr);
289     rbuf_nz = sbuf_nz + nsends;
290 
291     /* post receives of other's nzlocal */
292     for (i=0; i<nrecvs; i++){
293       ierr = MPI_Irecv(rbuf_nz+i,1,MPIU_INT,MPI_ANY_SOURCE,tag1,comm,r_waits1+i);CHKERRQ(ierr);
294     }
295     /* send nzlocal to others */
296     for (i=0; i<nsends; i++){
297       sbuf_nz[i] = nzlocal;
298       ierr = MPI_Isend(sbuf_nz+i,1,MPIU_INT,send_rank[i],tag1,comm,s_waits1+i);CHKERRQ(ierr);
299     }
300     /* wait on receives of nzlocal; allocate space for rbuf_j, rbuf_a */
301     count = nrecvs;
302     while (count) {
303       ierr = MPI_Waitany(nrecvs,r_waits1,&imdex,&recv_status);CHKERRQ(ierr);
304       recv_rank[imdex] = recv_status.MPI_SOURCE;
305       /* allocate rbuf_a and rbuf_j; then post receives of rbuf_j */
306       ierr = PetscMalloc((rbuf_nz[imdex]+1)*sizeof(PetscScalar),&rbuf_a[imdex]);CHKERRQ(ierr);
307 
308       i = rowrange[recv_status.MPI_SOURCE+1] - rowrange[recv_status.MPI_SOURCE]; /* number of expected mat->i */
309       rbuf_nz[imdex] += i + 2;
310       ierr = PetscMalloc(rbuf_nz[imdex]*sizeof(PetscInt),&rbuf_j[imdex]);CHKERRQ(ierr);
311       ierr = MPI_Irecv(rbuf_j[imdex],rbuf_nz[imdex],MPIU_INT,recv_status.MPI_SOURCE,tag2,comm,r_waits2+imdex);CHKERRQ(ierr);
312       count--;
313     }
314     /* wait on sends of nzlocal */
315     if (nsends) {ierr = MPI_Waitall(nsends,s_waits1,send_status);CHKERRQ(ierr);}
316     /* send mat->i,j to others, and recv from others */
317     /*------------------------------------------------*/
318     for (i=0; i<nsends; i++){
319       j = nzlocal + rowrange[rank+1] - rowrange[rank] + 1;
320       ierr = MPI_Isend(sbuf_j,j,MPIU_INT,send_rank[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
321     }
322     /* wait on receives of mat->i,j */
323     /*------------------------------*/
324     count = nrecvs;
325     while (count) {
326       ierr = MPI_Waitany(nrecvs,r_waits2,&imdex,&recv_status);CHKERRQ(ierr);
327       if (recv_rank[imdex] != recv_status.MPI_SOURCE) SETERRQ2(1, "recv_rank %d != MPI_SOURCE %d",recv_rank[imdex],recv_status.MPI_SOURCE);
328       count--;
329     }
330     /* wait on sends of mat->i,j */
331     /*---------------------------*/
332     if (nsends) {
333       ierr = MPI_Waitall(nsends,s_waits2,send_status);CHKERRQ(ierr);
334     }
335   } /* end of if (reuse == MAT_INITIAL_MATRIX) */
336 
337   /* post receives, send and receive mat->a */
338   /*----------------------------------------*/
339   for (imdex=0; imdex<nrecvs; imdex++) {
340     ierr = MPI_Irecv(rbuf_a[imdex],rbuf_nz[imdex],MPIU_SCALAR,recv_rank[imdex],tag3,comm,r_waits3+imdex);CHKERRQ(ierr);
341   }
342   for (i=0; i<nsends; i++){
343     ierr = MPI_Isend(sbuf_a,nzlocal,MPIU_SCALAR,send_rank[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
344   }
345   count = nrecvs;
346   while (count) {
347     ierr = MPI_Waitany(nrecvs,r_waits3,&imdex,&recv_status);CHKERRQ(ierr);
348     if (recv_rank[imdex] != recv_status.MPI_SOURCE) SETERRQ2(1, "recv_rank %d != MPI_SOURCE %d",recv_rank[imdex],recv_status.MPI_SOURCE);
349     count--;
350   }
351   if (nsends) {
352     ierr = MPI_Waitall(nsends,s_waits3,send_status);CHKERRQ(ierr);
353   }
354 
355   ierr = PetscFree2(s_waits3,send_status);CHKERRQ(ierr);
356 
357   /* create redundant matrix */
358   /*-------------------------*/
359   if (reuse == MAT_INITIAL_MATRIX){
360     /* compute rownz_max for preallocation */
361     for (imdex=0; imdex<nrecvs; imdex++){
362       j = rowrange[recv_rank[imdex]+1] - rowrange[recv_rank[imdex]];
363       rptr = rbuf_j[imdex];
364       for (i=0; i<j; i++){
365         ncols = rptr[i+1] - rptr[i];
366         if (rownz_max < ncols) rownz_max = ncols;
367       }
368     }
369 
370     ierr = MatCreate(subcomm,&C);CHKERRQ(ierr);
371     ierr = MatSetSizes(C,mlocal_sub,mlocal_sub,PETSC_DECIDE,PETSC_DECIDE);CHKERRQ(ierr);
372     ierr = MatSetFromOptions(C);CHKERRQ(ierr);
373     ierr = MatSeqAIJSetPreallocation(C,rownz_max,PETSC_NULL);CHKERRQ(ierr);
374     ierr = MatMPIAIJSetPreallocation(C,rownz_max,PETSC_NULL,rownz_max,PETSC_NULL);CHKERRQ(ierr);
375   } else {
376     C = *matredundant;
377   }
378 
379   /* insert local matrix entries */
380   rptr = sbuf_j;
381   cols = sbuf_j + rend-rstart + 1;
382   vals = sbuf_a;
383   for (i=0; i<rend-rstart; i++){
384     row   = i + rstart;
385     ncols = rptr[i+1] - rptr[i];
386     ierr = MatSetValues(C,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
387     vals += ncols;
388     cols += ncols;
389   }
390   /* insert received matrix entries */
391   for (imdex=0; imdex<nrecvs; imdex++){
392     rstart = rowrange[recv_rank[imdex]];
393     rend   = rowrange[recv_rank[imdex]+1];
394     rptr = rbuf_j[imdex];
395     cols = rbuf_j[imdex] + rend-rstart + 1;
396     vals = rbuf_a[imdex];
397     for (i=0; i<rend-rstart; i++){
398       row   = i + rstart;
399       ncols = rptr[i+1] - rptr[i];
400       ierr = MatSetValues(C,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
401       vals += ncols;
402       cols += ncols;
403     }
404   }
405   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
406   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
407   ierr = MatGetSize(C,&M,&N);CHKERRQ(ierr);
408   if (M != mat->rmap.N || N != mat->cmap.N) SETERRQ2(PETSC_ERR_ARG_INCOMP,"redundant mat size %d != input mat size %d",M,mat->rmap.N);
409   if (reuse == MAT_INITIAL_MATRIX){
410     PetscContainer container;
411     *matredundant = C;
412     /* create a supporting struct and attach it to C for reuse */
413     ierr = PetscNew(Mat_Redundant,&redund);CHKERRQ(ierr);
414     ierr = PetscContainerCreate(PETSC_COMM_SELF,&container);CHKERRQ(ierr);
415     ierr = PetscContainerSetPointer(container,redund);CHKERRQ(ierr);
416     ierr = PetscObjectCompose((PetscObject)C,"Mat_Redundant",(PetscObject)container);CHKERRQ(ierr);
417     ierr = PetscContainerSetUserDestroy(container,PetscContainerDestroy_MatRedundant);CHKERRQ(ierr);
418 
419     redund->nzlocal = nzlocal;
420     redund->nsends  = nsends;
421     redund->nrecvs  = nrecvs;
422     redund->send_rank = send_rank;
423     redund->sbuf_nz = sbuf_nz;
424     redund->sbuf_j  = sbuf_j;
425     redund->sbuf_a  = sbuf_a;
426     redund->rbuf_j  = rbuf_j;
427     redund->rbuf_a  = rbuf_a;
428 
429     redund->MatDestroy = C->ops->destroy;
430     C->ops->destroy    = MatDestroy_MatRedundant;
431   }
432   PetscFunctionReturn(0);
433 }
434 
435 #undef __FUNCT__
436 #define __FUNCT__ "PCSetUp_Redundant"
437 static PetscErrorCode PCSetUp_Redundant(PC pc)
438 {
439   PC_Redundant   *red  = (PC_Redundant*)pc->data;
440   PetscErrorCode ierr;
441   PetscInt       mstart,mend,mlocal,m;
442   PetscMPIInt    size;
443   MatReuse       reuse = MAT_INITIAL_MATRIX;
444   MatStructure   str   = DIFFERENT_NONZERO_PATTERN;
445   MPI_Comm       comm;
446   Vec            vec;
447 
448   PetscInt       mlocal_sub;
449   PetscMPIInt    subsize,subrank;
450   PetscInt       rstart_sub,rend_sub,mloc_sub;
451 
452   PetscFunctionBegin;
453   ierr = PetscObjectGetComm((PetscObject)pc->pmat,&comm);CHKERRQ(ierr);
454   ierr = MatGetVecs(pc->pmat,&vec,0);CHKERRQ(ierr);
455   ierr = PCSetFromOptions(red->pc);CHKERRQ(ierr);
456   ierr = VecGetSize(vec,&m);CHKERRQ(ierr);
457   if (!pc->setupcalled) {
458     /* create working vectors xsub/ysub and xdup/ydup */
459     ierr = VecGetLocalSize(vec,&mlocal);CHKERRQ(ierr);
460     ierr = VecGetOwnershipRange(vec,&mstart,&mend);CHKERRQ(ierr);
461 
462     /* get local size of xsub/ysub */
463     ierr = MPI_Comm_size(red->subcomm,&subsize);CHKERRQ(ierr);
464     ierr = MPI_Comm_rank(red->subcomm,&subrank);CHKERRQ(ierr);
465     rstart_sub = pc->pmat->rmap.range[red->nsubcomm*subrank]; /* rstart in xsub/ysub */
466     if (subrank+1 < subsize){
467       rend_sub = pc->pmat->rmap.range[red->nsubcomm*(subrank+1)];
468     } else {
469       rend_sub = m;
470     }
471     mloc_sub = rend_sub - rstart_sub;
472     ierr = VecCreateMPI(red->subcomm,mloc_sub,PETSC_DECIDE,&red->ysub);CHKERRQ(ierr);
473     /* create xsub with empty local arrays, because xdup's arrays will be placed into it */
474     ierr = VecCreateMPIWithArray(red->subcomm,mloc_sub,PETSC_DECIDE,PETSC_NULL,&red->xsub);CHKERRQ(ierr);
475 
476     /* create xdup and ydup. ydup has empty local arrays because ysub's arrays will be placed into it.
477        Note: we use communicator dupcomm, not pc->comm! */
478     ierr = VecCreateMPI(red->dupcomm,mloc_sub,PETSC_DECIDE,&red->xdup);CHKERRQ(ierr);
479     ierr = VecCreateMPIWithArray(red->dupcomm,mloc_sub,PETSC_DECIDE,PETSC_NULL,&red->ydup);CHKERRQ(ierr);
480 
481     /* create vec scatters */
482     if (!red->scatterin){
483       IS       is1,is2;
484       PetscInt *idx1,*idx2,i,j,k;
485       ierr = PetscMalloc(2*red->nsubcomm*mlocal*sizeof(PetscInt),&idx1);CHKERRQ(ierr);
486       idx2 = idx1 + red->nsubcomm*mlocal;
487       j = 0;
488       for (k=0; k<red->nsubcomm; k++){
489         for (i=mstart; i<mend; i++){
490           idx1[j]   = i;
491           idx2[j++] = i + m*k;
492         }
493       }
494       ierr = ISCreateGeneral(comm,red->nsubcomm*mlocal,idx1,&is1);CHKERRQ(ierr);
495       ierr = ISCreateGeneral(comm,red->nsubcomm*mlocal,idx2,&is2);CHKERRQ(ierr);
496       ierr = VecScatterCreate(vec,is1,red->xdup,is2,&red->scatterin);CHKERRQ(ierr);
497       ierr = ISDestroy(is1);CHKERRQ(ierr);
498       ierr = ISDestroy(is2);CHKERRQ(ierr);
499 
500       ierr = ISCreateStride(comm,mlocal,mstart+ red->color*m,1,&is1);CHKERRQ(ierr);
501       ierr = ISCreateStride(comm,mlocal,mstart,1,&is2);CHKERRQ(ierr);
502       ierr = VecScatterCreate(red->xdup,is1,vec,is2,&red->scatterout);CHKERRQ(ierr);
503       ierr = ISDestroy(is1);CHKERRQ(ierr);
504       ierr = ISDestroy(is2);CHKERRQ(ierr);
505       ierr = PetscFree(idx1);CHKERRQ(ierr);
506     }
507   }
508   ierr = VecDestroy(vec);CHKERRQ(ierr);
509 
510   /* if pmatrix set by user is sequential then we do not need to gather the parallel matrix */
511   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
512   if (size == 1) {
513     red->useparallelmat = PETSC_FALSE;
514   }
515 
516   if (red->useparallelmat) {
517     if (pc->setupcalled == 1 && pc->flag == DIFFERENT_NONZERO_PATTERN) {
518       /* destroy old matrices */
519       if (red->pmats) {
520         ierr = MatDestroy(red->pmats);CHKERRQ(ierr);
521       }
522     } else if (pc->setupcalled == 1) {
523       reuse = MAT_REUSE_MATRIX;
524       str   = SAME_NONZERO_PATTERN;
525     }
526 
527     /* grab the parallel matrix and put it onto the processes of a subcommunicator */
528     /*--------------------------------------------------------------------------*/
529     ierr = VecGetLocalSize(red->ysub,&mlocal_sub);CHKERRQ(ierr);
530     ierr = MatGetRedundantMatrix_AIJ(pc->pmat,red->nsubcomm,red->subcomm,mlocal_sub,reuse,&red->pmats);CHKERRQ(ierr);
531     /* set the operators for the PC on the subcommunicator */
532     ierr = PCSetOperators(red->pc,red->pmats,red->pmats,str);CHKERRQ(ierr);
533   } else {
534     ierr = PCSetOperators(red->pc,pc->mat,pc->pmat,pc->flag);CHKERRQ(ierr);
535   }
536   ierr = PCSetFromOptions(red->pc);CHKERRQ(ierr);
537   ierr = PCSetUp(red->pc);CHKERRQ(ierr);
538   PetscFunctionReturn(0);
539 }
540 
541 #undef __FUNCT__
542 #define __FUNCT__ "PCApply_Redundant"
543 static PetscErrorCode PCApply_Redundant(PC pc,Vec x,Vec y)
544 {
545   PC_Redundant   *red = (PC_Redundant*)pc->data;
546   PetscErrorCode ierr;
547   PetscScalar    *array;
548 
549   PetscFunctionBegin;
550   /* scatter x to xdup */
551   ierr = VecScatterBegin(x,red->xdup,INSERT_VALUES,SCATTER_FORWARD,red->scatterin);CHKERRQ(ierr);
552   ierr = VecScatterEnd(x,red->xdup,INSERT_VALUES,SCATTER_FORWARD,red->scatterin);CHKERRQ(ierr);
553 
554   /* place xdup's local array into xsub */
555   ierr = VecGetArray(red->xdup,&array);CHKERRQ(ierr);
556   ierr = VecPlaceArray(red->xsub,(const PetscScalar*)array);CHKERRQ(ierr);
557 
558   /* apply preconditioner on each processor */
559   ierr = PCApply(red->pc,red->xsub,red->ysub);CHKERRQ(ierr);
560   ierr = VecResetArray(red->xsub);CHKERRQ(ierr);
561   ierr = VecRestoreArray(red->xdup,&array);CHKERRQ(ierr);
562 
563   /* place ysub's local array into ydup */
564   ierr = VecGetArray(red->ysub,&array);CHKERRQ(ierr);
565   ierr = VecPlaceArray(red->ydup,(const PetscScalar*)array);CHKERRQ(ierr);
566 
567   /* scatter ydup to y */
568   ierr = VecScatterBegin(red->ydup,y,INSERT_VALUES,SCATTER_FORWARD,red->scatterout);CHKERRQ(ierr);
569   ierr = VecScatterEnd(red->ydup,y,INSERT_VALUES,SCATTER_FORWARD,red->scatterout);CHKERRQ(ierr);
570   ierr = VecResetArray(red->ydup);CHKERRQ(ierr);
571   ierr = VecRestoreArray(red->ysub,&array);CHKERRQ(ierr);
572   PetscFunctionReturn(0);
573 }
574 
575 #undef __FUNCT__
576 #define __FUNCT__ "PCDestroy_Redundant"
577 static PetscErrorCode PCDestroy_Redundant(PC pc)
578 {
579   PC_Redundant   *red = (PC_Redundant*)pc->data;
580   PetscErrorCode ierr;
581 
582   PetscFunctionBegin;
583   if (red->scatterin)  {ierr = VecScatterDestroy(red->scatterin);CHKERRQ(ierr);}
584   if (red->scatterout) {ierr = VecScatterDestroy(red->scatterout);CHKERRQ(ierr);}
585   if (red->ysub)       {ierr = VecDestroy(red->ysub);CHKERRQ(ierr);}
586   if (red->xsub)       {ierr = VecDestroy(red->xsub);CHKERRQ(ierr);}
587   if (red->xdup)       {ierr = VecDestroy(red->xdup);CHKERRQ(ierr);}
588   if (red->ydup)       {ierr = VecDestroy(red->ydup);CHKERRQ(ierr);}
589   if (red->pmats) {
590     ierr = MatDestroy(red->pmats);CHKERRQ(ierr);
591   }
592 
593 
594   ierr = PCDestroy(red->pc);CHKERRQ(ierr);
595   ierr = PetscFree(red);CHKERRQ(ierr);
596   PetscFunctionReturn(0);
597 }
598 
599 #undef __FUNCT__
600 #define __FUNCT__ "PCSetFromOptions_Redundant"
601 static PetscErrorCode PCSetFromOptions_Redundant(PC pc)
602 {
603   PetscErrorCode ierr;
604   PC_Redundant   *red = (PC_Redundant*)pc->data;
605 
606   PetscFunctionBegin;
607   ierr = PetscOptionsHead("Redundant options");CHKERRQ(ierr);
608   ierr = PetscOptionsInt("-pc_redundant_number_comm","Number of subcommunicators","PCRedundantSetNumComm",red->nsubcomm,&red->nsubcomm,0);CHKERRQ(ierr);
609   ierr = PetscOptionsTail();CHKERRQ(ierr);
610   PetscFunctionReturn(0);
611 }
612 
613 EXTERN_C_BEGIN
614 #undef __FUNCT__
615 #define __FUNCT__ "PCRedundantSetScatter_Redundant"
616 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantSetScatter_Redundant(PC pc,VecScatter in,VecScatter out)
617 {
618   PC_Redundant   *red = (PC_Redundant*)pc->data;
619   PetscErrorCode ierr;
620 
621   PetscFunctionBegin;
622   red->scatterin  = in;
623   red->scatterout = out;
624   ierr = PetscObjectReference((PetscObject)in);CHKERRQ(ierr);
625   ierr = PetscObjectReference((PetscObject)out);CHKERRQ(ierr);
626   PetscFunctionReturn(0);
627 }
628 EXTERN_C_END
629 
630 #undef __FUNCT__
631 #define __FUNCT__ "PCRedundantSetScatter"
632 /*@
633    PCRedundantSetScatter - Sets the scatter used to copy values into the
634      redundant local solve and the scatter to move them back into the global
635      vector.
636 
637    Collective on PC
638 
639    Input Parameters:
640 +  pc - the preconditioner context
641 .  in - the scatter to move the values in
642 -  out - the scatter to move them out
643 
644    Level: advanced
645 
646 .keywords: PC, redundant solve
647 @*/
648 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantSetScatter(PC pc,VecScatter in,VecScatter out)
649 {
650   PetscErrorCode ierr,(*f)(PC,VecScatter,VecScatter);
651 
652   PetscFunctionBegin;
653   PetscValidHeaderSpecific(pc,PC_COOKIE,1);
654   PetscValidHeaderSpecific(in,VEC_SCATTER_COOKIE,2);
655   PetscValidHeaderSpecific(out,VEC_SCATTER_COOKIE,3);
656   ierr = PetscObjectQueryFunction((PetscObject)pc,"PCRedundantSetScatter_C",(void (**)(void))&f);CHKERRQ(ierr);
657   if (f) {
658     ierr = (*f)(pc,in,out);CHKERRQ(ierr);
659   }
660   PetscFunctionReturn(0);
661 }
662 
663 EXTERN_C_BEGIN
664 #undef __FUNCT__
665 #define __FUNCT__ "PCRedundantGetPC_Redundant"
666 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetPC_Redundant(PC pc,PC *innerpc)
667 {
668   PC_Redundant *red = (PC_Redundant*)pc->data;
669 
670   PetscFunctionBegin;
671   *innerpc = red->pc;
672   PetscFunctionReturn(0);
673 }
674 EXTERN_C_END
675 
676 #undef __FUNCT__
677 #define __FUNCT__ "PCRedundantGetPC"
678 /*@
679    PCRedundantGetPC - Gets the PC created by the redundant PC on each subcommunicator.
680 
681    Not Collective
682 
683    Input Parameter:
684 .  pc - the preconditioner context
685 
686    Output Parameter:
687 .  innerpc - the PC defined on the subcommunicator
688 
689    Level: advanced
690 
691 .keywords: PC, redundant solve
692 @*/
693 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetPC(PC pc,PC *innerpc)
694 {
695   PetscErrorCode ierr,(*f)(PC,PC*);
696 
697   PetscFunctionBegin;
698   PetscValidHeaderSpecific(pc,PC_COOKIE,1);
699   PetscValidPointer(innerpc,2);
700   ierr = PetscObjectQueryFunction((PetscObject)pc,"PCRedundantGetPC_C",(void (**)(void))&f);CHKERRQ(ierr);
701   if (f) {
702     ierr = (*f)(pc,innerpc);CHKERRQ(ierr);
703   }
704   PetscFunctionReturn(0);
705 }
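
/*
   A minimal usage sketch for PCRedundantGetPC() (assumes pc has already been set to
   PCREDUNDANT; innerpc is a hypothetical variable name used only here):

     PC innerpc;
     ierr = PCRedundantGetPC(pc,&innerpc);CHKERRQ(ierr);
     ierr = PCSetType(innerpc,PCILU);CHKERRQ(ierr);

   This replaces the default LU solver on each subcommunicator; the same effect is
   obtained from the options database with -redundant_pc_type ilu.
*/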
706 
707 EXTERN_C_BEGIN
708 #undef __FUNCT__
709 #define __FUNCT__ "PCRedundantGetOperators_Redundant"
710 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetOperators_Redundant(PC pc,Mat *mat,Mat *pmat)
711 {
712   PC_Redundant *red = (PC_Redundant*)pc->data;
713 
714   PetscFunctionBegin;
715   if (mat)  *mat  = red->pmats;
716   if (pmat) *pmat = red->pmats;
717   PetscFunctionReturn(0);
718 }
719 EXTERN_C_END
720 
721 #undef __FUNCT__
722 #define __FUNCT__ "PCRedundantGetOperators"
723 /*@
724    PCRedundantGetOperators - gets the redundant matrix and preconditioner matrix used on each subcommunicator
725 
726    Not Collective
727 
728    Input Parameter:
729 .  pc - the preconditioner context
730 
731    Output Parameters:
732 +  mat - the matrix
733 -  pmat - the (possibly different) preconditioner matrix
734 
735    Level: advanced
736 
737 .keywords: PC, redundant solve
738 @*/
739 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetOperators(PC pc,Mat *mat,Mat *pmat)
740 {
741   PetscErrorCode ierr,(*f)(PC,Mat*,Mat*);
742 
743   PetscFunctionBegin;
744   PetscValidHeaderSpecific(pc,PC_COOKIE,1);
745   if (mat)  PetscValidPointer(mat,2);
746   if (pmat) PetscValidPointer(pmat,3);
747   ierr = PetscObjectQueryFunction((PetscObject)pc,"PCRedundantGetOperators_C",(void (**)(void))&f);CHKERRQ(ierr);
748   if (f) {
749     ierr = (*f)(pc,mat,pmat);CHKERRQ(ierr);
750   }
751   PetscFunctionReturn(0);
752 }
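
/*
   A small usage sketch for PCRedundantGetOperators() (assumes PCSetUp() has been called,
   so the gathered matrix exists; redmat, M and N are hypothetical names):

     Mat      redmat;
     PetscInt M,N;
     ierr = PCRedundantGetOperators(pc,&redmat,PETSC_NULL);CHKERRQ(ierr);
     ierr = MatGetSize(redmat,&M,&N);CHKERRQ(ierr);

   The gathered matrix has the same global size as pc->pmat but lives on the
   subcommunicator, so each subgroup can factor the entire problem.
*/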
753 
754 /* -------------------------------------------------------------------------------------*/
755 /*MC
756      PCREDUNDANT - Runs a preconditioner for the entire problem on subgroups of processors
757 
758 
759      Options for the redundant preconditioners can be set with -redundant_pc_xxx
760 
761   Options Database:
762 .  -pc_redundant_number_comm - number of subcommunicators to use
763 
764    Level: intermediate
765 
766 
767 .seealso:  PCCreate(), PCSetType(), PCType (for list of available types), PCRedundantSetScatter(),
768            PCRedundantGetPC(), PCRedundantGetOperators()
769 
770 M*/
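
/*
   A minimal usage sketch (assumes Mat A and Vecs b,x already exist; ksp and pc are
   hypothetical variable names):

     KSP ksp;
     PC  pc;
     ierr = KSPCreate(PETSC_COMM_WORLD,&ksp);CHKERRQ(ierr);
     ierr = KSPSetOperators(ksp,A,A,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
     ierr = KSPGetPC(ksp,&pc);CHKERRQ(ierr);
     ierr = PCSetType(pc,PCREDUNDANT);CHKERRQ(ierr);
     ierr = KSPSetFromOptions(ksp);CHKERRQ(ierr);
     ierr = KSPSolve(ksp,b,x);CHKERRQ(ierr);

   or, equivalently, from the command line:

     -pc_type redundant -pc_redundant_number_comm 2 -redundant_pc_type lu
*/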
771 
772 EXTERN_C_BEGIN
773 #undef __FUNCT__
774 #define __FUNCT__ "PCCreate_Redundant"
775 PetscErrorCode PETSCKSP_DLLEXPORT PCCreate_Redundant(PC pc)
776 {
777   PetscErrorCode ierr;
778   PC_Redundant   *red;
779   const char     *prefix;
780   PetscInt       nsubcomm,np_subcomm,nleftover,i,j,color;
781   PetscMPIInt    rank,size,subrank,*subsize,duprank;
782   MPI_Comm       subcomm=0,dupcomm=0;
783 
784   PetscFunctionBegin;
785   ierr = PetscNew(PC_Redundant,&red);CHKERRQ(ierr);
786   ierr = PetscLogObjectMemory(pc,sizeof(PC_Redundant));CHKERRQ(ierr);
787   red->useparallelmat   = PETSC_TRUE;
788 
789   ierr = MPI_Comm_rank(pc->comm,&rank);CHKERRQ(ierr);
790   ierr = MPI_Comm_size(pc->comm,&size);CHKERRQ(ierr);
791   nsubcomm = size;
792   ierr = PetscOptionsGetInt(PETSC_NULL,"-pc_redundant_number_comm",&nsubcomm,PETSC_NULL);CHKERRQ(ierr);
793   if (nsubcomm > size) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE, "Num of subcommunicators %D cannot be larger than MPI_Comm_size %D",nsubcomm,size);
794 
795   /*--------------------------------------------------------------------------------------------------
796     To avoid data scattering from subcomm back to the original comm, we create the subcommunicators by
797     iteratively taking one process into each subcomm.
798     An example: size=4, nsubcomm=3
799      pc->comm:
800       rank:     [0]  [1]  [2]  [3]
801       color:     0    1    2    0
802 
803      subcomm:
804       subrank:  [0]  [0]  [0]  [1]
805 
806      dupcomm:
807       duprank:  [0]  [2]  [3]  [1]
808 
809      Here, subcomm[color = 0] has subsize=2, owns processes [0] and [3]
810            subcomm[color = 1] has subsize=1, owns process [1]
811            subcomm[color = 2] has subsize=1, owns process [2]
812           dupcomm has same number of processes as pc->comm, and its duprank maps
813           processes in subcomm contiguously into a 1d array:
814             duprank: [0] [1]      [2]         [3]
815             rank:    [0] [3]      [1]         [2]
816                     subcomm[0] subcomm[1]  subcomm[2]
817    ----------------------------------------------------------------------------------------*/
818 
819   /* get size of each subcommunicators */
820   ierr = PetscMalloc((1+nsubcomm)*sizeof(PetscMPIInt),&subsize);CHKERRQ(ierr);
821   np_subcomm = size/nsubcomm;
822   nleftover  = size - nsubcomm*np_subcomm;
823   for (i=0; i<nsubcomm; i++){
824     subsize[i] = np_subcomm;
825     if (i<nleftover) subsize[i]++;
826   }
827 
828   /* find color for this proc */
829   color   = rank%nsubcomm;
830   subrank = rank/nsubcomm;
831 
832   ierr = MPI_Comm_split(pc->comm,color,subrank,&subcomm);CHKERRQ(ierr);
833   red->subcomm  = subcomm;
834   red->color    = color;
835   red->nsubcomm = nsubcomm;
836 
837   j = 0; duprank = 0;
838   for (i=0; i<nsubcomm; i++){
839     if (j == color){
840       duprank += subrank;
841       break;
842     }
843     duprank += subsize[i]; j++;
844   }
845 
846   /* create dupcomm with same size as comm, but its rank, duprank, maps subcomm's contiguously into dupcomm */
847   ierr = MPI_Comm_split(pc->comm,0,duprank,&dupcomm);CHKERRQ(ierr);
848   red->dupcomm = dupcomm;
849   ierr = PetscFree(subsize);CHKERRQ(ierr);
850 
851   /* create the inner PC of which each subcommunicator has its own copy */
852   ierr = PCCreate(subcomm,&red->pc);CHKERRQ(ierr);
853   ierr = PCSetType(red->pc,PCLU);CHKERRQ(ierr);
854   ierr = PCGetOptionsPrefix(pc,&prefix);CHKERRQ(ierr);
855   ierr = PCSetOptionsPrefix(red->pc,prefix);CHKERRQ(ierr);
856   ierr = PCAppendOptionsPrefix(red->pc,"redundant_");CHKERRQ(ierr);
857 
858   pc->ops->apply             = PCApply_Redundant;
859   pc->ops->applytranspose    = 0;
860   pc->ops->setup             = PCSetUp_Redundant;
861   pc->ops->destroy           = PCDestroy_Redundant;
862   pc->ops->setfromoptions    = PCSetFromOptions_Redundant;
863   pc->ops->setuponblocks     = 0;
864   pc->ops->view              = PCView_Redundant;
865   pc->ops->applyrichardson   = 0;
866 
867   pc->data     = (void*)red;
868 
869   ierr = PetscObjectComposeFunctionDynamic((PetscObject)pc,"PCRedundantSetScatter_C","PCRedundantSetScatter_Redundant",
870                     PCRedundantSetScatter_Redundant);CHKERRQ(ierr);
871   ierr = PetscObjectComposeFunctionDynamic((PetscObject)pc,"PCRedundantGetPC_C","PCRedundantGetPC_Redundant",
872                     PCRedundantGetPC_Redundant);CHKERRQ(ierr);
873   ierr = PetscObjectComposeFunctionDynamic((PetscObject)pc,"PCRedundantGetOperators_C","PCRedundantGetOperators_Redundant",
874                     PCRedundantGetOperators_Redundant);CHKERRQ(ierr);
875 
876   PetscFunctionReturn(0);
877 }
878 EXTERN_C_END
879