xref: /petsc/src/ksp/pc/impls/redundant/redundant.c (revision 3f457be154c925b43e77babd097a01a246d09d34)
1 #define PETSCKSP_DLL
2 
3 /*
4   This file defines a "solve the problem redundantly on each subgroup of processor" preconditioner.
5 */
6 #include "private/pcimpl.h"     /*I "petscpc.h" I*/
7 #include "petscksp.h"
8 
9 #undef CONTIGUOUS_COLOR
10 #define INTER_COLOR
11 
12 typedef struct {
13   PC         pc;                   /* actual preconditioner used on each processor */
14   Vec        xsub,ysub;            /* vectors of a subcommunicator to hold parallel vectors of pc->comm */
15   Vec        xdup,ydup;            /* parallel vector that congregates xsub or ysub facilitating vector scattering */
16   Mat        *pmats;               /* matrix and optional preconditioner matrix */
17   Mat        pmats_sub;            /* matrix and optional preconditioner matrix */
18   VecScatter scatterin,scatterout; /* scatter used to move all values to each processor group (subcommunicator) */
19   PetscTruth useparallelmat;
20   MPI_Comm   subcomm;              /* processors belong to a subcommunicator implement a PC in parallel */
21   MPI_Comm   dupcomm;              /* processors belong to pc->comm with their rank remapped in the way
22                                       that vector xdup/ydup has contiguous rank while appending xsub/ysub along their colors */
23   PetscInt   nsubcomm;             /* num of subcommunicators, which equals the num of redundant matrix systems */
24   PetscInt   color;                /* color of processors in a subcommunicator */
25 } PC_Redundant;
26 
27 #undef __FUNCT__
28 #define __FUNCT__ "PCView_Redundant"
29 static PetscErrorCode PCView_Redundant(PC pc,PetscViewer viewer)
30 {
31   PC_Redundant   *red = (PC_Redundant*)pc->data;
32   PetscErrorCode ierr;
33   PetscMPIInt    rank;
34   PetscTruth     iascii,isstring;
35   PetscViewer    sviewer;
36 
37   PetscFunctionBegin;
38   ierr = MPI_Comm_rank(pc->comm,&rank);CHKERRQ(ierr);
39   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_ASCII,&iascii);CHKERRQ(ierr);
40   ierr = PetscTypeCompare((PetscObject)viewer,PETSC_VIEWER_STRING,&isstring);CHKERRQ(ierr);
41   if (iascii) {
42     ierr = PetscViewerASCIIPrintf(viewer,"  Redundant solver preconditioner: Actual PC follows\n");CHKERRQ(ierr);
43     ierr = PetscViewerGetSingleton(viewer,&sviewer);CHKERRQ(ierr);
44     if (!rank) {
45       ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
46       ierr = PCView(red->pc,sviewer);CHKERRQ(ierr);
47       ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
48     }
49     ierr = PetscViewerRestoreSingleton(viewer,&sviewer);CHKERRQ(ierr);
50   } else if (isstring) {
51     ierr = PetscViewerStringSPrintf(viewer," Redundant solver preconditioner");CHKERRQ(ierr);
52     ierr = PetscViewerGetSingleton(viewer,&sviewer);CHKERRQ(ierr);
53     if (!rank) {
54       ierr = PCView(red->pc,sviewer);CHKERRQ(ierr);
55     }
56     ierr = PetscViewerRestoreSingleton(viewer,&sviewer);CHKERRQ(ierr);
57   } else {
58     SETERRQ1(PETSC_ERR_SUP,"Viewer type %s not supported for PC redundant",((PetscObject)viewer)->type_name);
59   }
60   PetscFunctionReturn(0);
61 }
62 
63 #include "src/mat/matimpl.h"        /*I "petscmat.h" I*/
64 #include "private/vecimpl.h"
65 #include "src/mat/impls/aij/mpi/mpiaij.h"   /*I "petscmat.h" I*/
66 #include "src/mat/impls/aij/seq/aij.h"      /*I "petscmat.h" I*/
67 
68 #undef __FUNCT__
69 #define __FUNCT__ "MatGetRedundantMatrix"
70 PetscErrorCode MatGetRedundantMatrix_AIJ(PC pc,Mat mat,MPI_Comm subcomm,PetscInt mlocal_sub,Mat *matredundant)
71 {
72   PetscMPIInt    rank,size,subrank,subsize;
73   MPI_Comm       comm=mat->comm;
74   PetscErrorCode ierr;
75   PC_Redundant   *red=(PC_Redundant*)pc->data;
76   PetscInt       nsubcomm=red->nsubcomm,nsends,nrecvs,i,prid=100,itmp;
77   PetscMPIInt    *send_rank,*recv_rank;
78   PetscInt       *rowrange=pc->pmat->rmap.range,mlocal_max,nzlocal;
79   Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)mat->data;
80   Mat            A=aij->A,B=aij->B;
81   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)B->data;
82   Mat            C;
83 
84   PetscInt       nleftover,np_subcomm,j;
85   PetscInt       nz_A,nz_B,*sbuf_j;
86   PetscScalar    *sbuf_a;
87   PetscInt       cstart=mat->cmap.rstart,cend=mat->cmap.rend,row,nzA,nzB,ncols,*cworkA,*cworkB;
88   PetscInt       rstart=mat->rmap.rstart,rend=mat->rmap.rend,*bmap=aij->garray,M,N;
89   PetscInt       *cols,ctmp,lwrite,*rptr,l;
90   PetscScalar    *vals,*aworkA,*aworkB;
91 
92   PetscMPIInt    tag1,tag2,tag3,imdex;
93   MPI_Request    *s_waits1,*s_waits2,*s_waits3,*r_waits1,*r_waits2,*r_waits3;
94   MPI_Status     recv_status,*send_status;
95   PetscInt       *sbuf_nz,*rbuf_nz,count;
96 
97   PetscInt     **rbuf_j;
98   PetscScalar  **rbuf_a;
99 
100   PetscFunctionBegin;
101   ierr = PetscOptionsGetInt(PETSC_NULL,"-prid",&prid,PETSC_NULL);CHKERRQ(ierr);
102   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
103   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
104   ierr = MPI_Comm_rank(subcomm,&subrank);CHKERRQ(ierr);
105   ierr = MPI_Comm_size(subcomm,&subsize);CHKERRQ(ierr);
106   /*
107   ierr = PetscSynchronizedPrintf(comm, "[%d] subrank %d, subsize %d\n",rank,subrank,subsize);
108   ierr = PetscSynchronizedFlush(comm);CHKERRQ(ierr);
109   */
110   /* get the destination processors */
111   ierr = PetscMalloc((2*size+1)*sizeof(PetscMPIInt),&send_rank);
112   recv_rank = send_rank + size;
113   np_subcomm = size/nsubcomm;
114   nleftover  = size - nsubcomm*np_subcomm;
115   nsends = 0; nrecvs = 0;
116   for (i=0; i<size; i++){ /* i=rank*/
117     if (subrank == i/nsubcomm && rank != i){ /* my_subrank == other's subrank */
118       send_rank[nsends] = i; nsends++;
119       recv_rank[nrecvs++] = i;
120     }
121   }
122   if (rank >= size - nleftover){/* this proc is a leftover processor */
123     i = size-nleftover-1;
124     j = 0;
125     while (j < nsubcomm - nleftover){
126       send_rank[nsends++] = i;
127       i--; j++;
128     }
129   }
130 
131   if (nleftover && subsize == size/nsubcomm && subrank==subsize-1){ /* this proc recvs from leftover processors */
132     for (i=0; i<nleftover; i++){
133       recv_rank[nrecvs++] = size-nleftover+i;
134     }
135   }
136   if (prid == rank){
137     printf("[%d] sends to ",rank);
138     for (i=0; i<nsends; i++) printf(" [%d],",send_rank[i]);
139     printf("  \n");
140   }
141   /*
142   ierr = PetscSynchronizedPrintf(comm, "[%d] nsends %d, nrecvs %d\n",rank,nsends,nrecvs);
143   ierr = PetscSynchronizedFlush(comm);CHKERRQ(ierr);
144   ierr = MPI_Barrier(comm);CHKERRQ(ierr);
145   */
146 
147   /* get this processor's nzlocal=nz_A+nz_B */
148   nz_A = a->nz; nz_B = b->nz;
149   nzlocal = nz_A + nz_B;
150 
151   /* allocate sbuf_j, sbuf_a, then copy mat's local entries into the buffers */
152   itmp = nzlocal + rowrange[rank+1] - rowrange[rank] + 2;
153   ierr = PetscMalloc(itmp*sizeof(PetscInt),&sbuf_j);CHKERRQ(ierr);
154   ierr = PetscMalloc((nzlocal+1)*sizeof(PetscScalar),&sbuf_a);CHKERRQ(ierr);
155 
156     rptr = sbuf_j;
157     cols = sbuf_j + rend-rstart + 1;
158     vals = sbuf_a;
159     rptr[0] = 0;
160     for (i=0; i<rend-rstart; i++){
161       row = i + rstart;
162       if (rank == prid) printf(" \n row %d: ",row);
163       nzA    = a->i[i+1] - a->i[i]; nzB = b->i[i+1] - b->i[i];
164       ncols  = nzA + nzB;
165       cworkA = a->j + a->i[i]; cworkB = b->j + b->i[i];
166       aworkA = a->a + a->i[i]; aworkB = b->a + b->i[i];
167       /* load the column indices for this row into cols */
168       lwrite = 0;
169       for (l=0; l<nzB; l++) {
170         if ((ctmp = bmap[cworkB[l]]) < cstart){
171           vals[lwrite]   = aworkB[l];
172           cols[lwrite++] = ctmp;
173           if (rank == prid) printf(" (%d,%g)",ctmp,aworkB[l]);
174         }
175       }
176       for (l=0; l<nzA; l++){
177         vals[lwrite]   = aworkA[l];
178         cols[lwrite++] = cstart + cworkA[l];
179         if (rank == prid) printf(" (%d,%g)",cstart + cworkA[l],aworkA[l]);
180       }
181       for (l=0; l<nzB; l++) {
182         if ((ctmp = bmap[cworkB[l]]) >= cend){
183           vals[lwrite]   = aworkB[l];
184           cols[lwrite++] = ctmp;
185           if (rank == prid) printf(" (%d,%g)",ctmp,aworkB[l]);
186         }
187       }
188       /* insert local matrix values into C */
189       //ierr = MatSetValues(C,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
190 
191       vals += ncols;
192       cols += ncols;
193       rptr[i+1] = rptr[i] + ncols;
194     }
195     /*
196     ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
197     ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
198     */
199     if (rptr[rend-rstart] != a->nz + b->nz) SETERRQ4(1, "rptr[%d] %d != %d + %d",rend-rstart,rptr[rend-rstart+1],a->nz,b->nz);
200 
201   /* send nzlocal to others, and recv other's nzlocal */
202   /*--------------------------------------------------*/
203   ierr = PetscMalloc3(nsends+nrecvs+1,PetscInt,&sbuf_nz,nrecvs,PetscInt*,&rbuf_j,nrecvs,PetscScalar*,&rbuf_a);CHKERRQ(ierr);
204   rbuf_nz = sbuf_nz + nsends;
205 
206   ierr = PetscMalloc2(3*(nsends + nrecvs)+1,MPI_Request,&s_waits1,nsends+1,MPI_Status,&send_status);CHKERRQ(ierr);
207   s_waits2 = s_waits1 + nsends;
208   s_waits3 = s_waits2 + nsends;
209   r_waits1 = s_waits3 + nsends;
210   r_waits2 = r_waits1 + nrecvs;
211   r_waits3 = r_waits2 + nrecvs;
212 
213   /* get some new tags to keep the communication clean */
214   ierr = PetscObjectGetNewTag((PetscObject)A,&tag1);CHKERRQ(ierr);
215   ierr = PetscObjectGetNewTag((PetscObject)A,&tag2);CHKERRQ(ierr);
216   ierr = PetscObjectGetNewTag((PetscObject)A,&tag3);CHKERRQ(ierr);
217 
218   /* post receives of other's nzlocal */
219   for (i=0; i<nrecvs; i++){
220     ierr = MPI_Irecv(rbuf_nz+i,1,MPIU_INT,MPI_ANY_SOURCE,tag1,comm,r_waits1+i);CHKERRQ(ierr);
221   }
222 
223   /* send nzlocal to others */
224   for (i=0; i<nsends; i++){
225     sbuf_nz[i] = nzlocal;
226     ierr = MPI_Isend(sbuf_nz+i,1,MPIU_INT,send_rank[i],tag1,comm,s_waits1+i);CHKERRQ(ierr);
227     if (prid == rank) printf(" [%d] sends nz %d to [%d]\n",rank,nzlocal,send_rank[i]);
228   }
229 
230   /* wait on receives of nzlocal; allocate space for rbuf_j, rbuf_a */
231   count = nrecvs;
232   while (count) {
233     ierr = MPI_Waitany(nrecvs,r_waits1,&imdex,&recv_status);CHKERRQ(ierr);
234     recv_rank[imdex] = recv_status.MPI_SOURCE;
235     /* allocate rbuf_a and rbuf_j; then post receives of rbuf_a and rbuf_j */
236     ierr = PetscMalloc((rbuf_nz[imdex]+1)*sizeof(PetscScalar),&rbuf_a[imdex]);CHKERRQ(ierr);
237     ierr = MPI_Irecv(rbuf_a[imdex],rbuf_nz[imdex],MPIU_SCALAR,recv_status.MPI_SOURCE,tag3,comm,r_waits3+imdex);CHKERRQ(ierr);
238 
239     itmp = rowrange[recv_status.MPI_SOURCE+1] - rowrange[recv_status.MPI_SOURCE]; /* number of expected mat->i */
240     rbuf_nz[imdex] += itmp+2;
241     ierr = PetscMalloc(rbuf_nz[imdex]*sizeof(PetscInt),&rbuf_j[imdex]);CHKERRQ(ierr);
242     ierr = MPI_Irecv(rbuf_j[imdex],rbuf_nz[imdex],MPIU_INT,recv_status.MPI_SOURCE,tag2,comm,r_waits2+imdex);CHKERRQ(ierr);
243     count--;
244   }
245 
246   /* wait on sends of nzlocal */
247   if (nsends) {ierr = MPI_Waitall(nsends,s_waits1,send_status);CHKERRQ(ierr);}
248 
249   /* send mat->i,j and mat->a to others, and recv from other's */
250   /*-----------------------------------------------------------*/
251   for (i=0; i<nsends; i++){
252     ierr = MPI_Isend(sbuf_a,nzlocal,MPIU_SCALAR,send_rank[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
253     itmp = nzlocal + rowrange[rank+1] - rowrange[rank] + 1;
254     ierr = MPI_Isend(sbuf_j,itmp,MPIU_INT,send_rank[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
255   }
256 
257   /* wait on receives of mat->i,j and mat->a */
258   /*-----------------------------------------*/
259   count = nrecvs;
260   while (count) {
261     ierr = MPI_Waitany(nrecvs,r_waits3,&imdex,&recv_status);CHKERRQ(ierr);
262     if (recv_rank[imdex] != recv_status.MPI_SOURCE) SETERRQ2(1, "recv_rank %d != MPI_SOURCE %d",recv_rank[imdex],recv_status.MPI_SOURCE);
263     ierr = MPI_Waitany(nrecvs,r_waits2,&imdex,&recv_status);CHKERRQ(ierr);
264     count--;
265   }
266 
267   /* wait on sends of mat->i,j and mat->a */
268   /*--------------------------------------*/
269   if (nsends) {
270     ierr = MPI_Waitall(nsends,s_waits3,send_status);CHKERRQ(ierr);
271     ierr = MPI_Waitall(nsends,s_waits2,send_status);CHKERRQ(ierr);
272   }
273   ierr = PetscFree(sbuf_nz);CHKERRQ(ierr);
274   ierr = PetscFree2(s_waits1,send_status);CHKERRQ(ierr);
275 
276   /* create redundant matrix */
277   /*-------------------------*/
278   ierr = MatCreate(subcomm,&C);CHKERRQ(ierr);
279   ierr = MatSetSizes(C,mlocal_sub,mlocal_sub,PETSC_DECIDE,PETSC_DECIDE);CHKERRQ(ierr);
280   ierr = MatSetFromOptions(C);CHKERRQ(ierr);
281 
282   /* insert local matrix entries */
283   rptr = sbuf_j;
284   cols = sbuf_j + rend-rstart + 1;
285   vals = sbuf_a;
286   for (i=0; i<rend-rstart; i++){
287     row   = i + rstart;
288     ncols = rptr[i+1] - rptr[i];
289     ierr = MatSetValues(C,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
290     vals += ncols;
291     cols += ncols;
292   }
293   /* insert received matrix entries */
294   for (imdex=0; imdex<nrecvs; imdex++){
295 
296     rstart = rowrange[recv_rank[imdex]];
297     rend   = rowrange[recv_rank[imdex]+1];
298     rptr = rbuf_j[imdex];
299     cols = rbuf_j[imdex] + rend-rstart + 1;
300     vals = rbuf_a[imdex];
301     for (i=0; i<rend-rstart; i++){
302       row   = i + rstart;
303       ncols = rptr[i+1] - rptr[i];
304       ierr = MatSetValues(C,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
305       vals += ncols;
306       cols += ncols;
307     }
308   }
309   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
310   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
311   ierr = MatGetSize(C,&M,&N);CHKERRQ(ierr);
312   if (M != mat->rmap.N || N != mat->cmap.N) SETERRQ2(PETSC_ERR_ARG_INCOMP,"redundant mat size %d != input mat size %d",M,mat->rmap.N);
313   *matredundant = C;
314 
315   /* free space */
316   ierr = PetscFree(send_rank);CHKERRQ(ierr);
317   ierr = PetscFree(sbuf_j);CHKERRQ(ierr);
318   ierr = PetscFree(sbuf_a);CHKERRQ(ierr);
319   for (i=0; i<nrecvs; i++){
320     ierr = PetscFree(rbuf_j[i]);CHKERRQ(ierr);
321     ierr = PetscFree(rbuf_a[i]);CHKERRQ(ierr);
322   }
323   ierr = PetscFree3(sbuf_nz,rbuf_j,rbuf_a);CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 #undef __FUNCT__
328 #define __FUNCT__ "PCSetUp_Redundant"
329 static PetscErrorCode PCSetUp_Redundant(PC pc)
330 {
331   PC_Redundant   *red  = (PC_Redundant*)pc->data;
332   PetscErrorCode ierr;
333   PetscInt       mstart,mend,mlocal,m;
334   PetscMPIInt    size;
335   IS             isl;
336   MatReuse       reuse = MAT_INITIAL_MATRIX;
337   MatStructure   str   = DIFFERENT_NONZERO_PATTERN;
338   MPI_Comm       comm;
339   Vec            vec;
340 
341   PetscMPIInt rank,size_sub,itmp;
342   PetscInt    mlocal_sub;
343   PetscMPIInt subsize,subrank;
344   PetscInt    rstart_sub,rend_sub,mloc_sub;
345   Mat         Aredundant;
346 
347   PetscFunctionBegin;
348   ierr = PetscObjectGetComm((PetscObject)pc->pmat,&comm);CHKERRQ(ierr);
349   ierr = MatGetVecs(pc->pmat,&vec,0);CHKERRQ(ierr);
350   ierr = PCSetFromOptions(red->pc);CHKERRQ(ierr);
351   ierr = VecGetSize(vec,&m);CHKERRQ(ierr);
352   if (!pc->setupcalled) {
353     /* create working vectors xsub/ysub and xdup/ydup */
354     ierr = VecGetLocalSize(vec,&mlocal);CHKERRQ(ierr);
355     ierr = VecGetOwnershipRange(vec,&mstart,&mend);CHKERRQ(ierr);
356 
357 #ifdef INTER_COLOR
358     /* get local size of xsub/ysub */
359     ierr = MPI_Comm_size(red->subcomm,&subsize);CHKERRQ(ierr);
360     ierr = MPI_Comm_rank(red->subcomm,&subrank);CHKERRQ(ierr);
361     rstart_sub = pc->pmat->rmap.range[red->nsubcomm*subrank]; /* rstart in xsub/ysub */
362     if (subrank+1 < subsize){
363       rend_sub = pc->pmat->rmap.range[red->nsubcomm*(subrank+1)];
364     } else {
365       rend_sub = m;
366     }
367     mloc_sub = rend_sub - rstart_sub;
368     ierr = VecCreateMPI(red->subcomm,mloc_sub,PETSC_DECIDE,&red->ysub);CHKERRQ(ierr);
369     /* create xsub with empty local arrays, because xdup's arrays will be placed into it */
370     ierr = VecCreateMPIWithArray(red->subcomm,mloc_sub,PETSC_DECIDE,PETSC_NULL,&red->xsub);CHKERRQ(ierr);
371 #endif
372 #ifdef CONTIGUOUS_COLOR
373     ierr = VecCreateMPI(red->subcomm,PETSC_DECIDE,m,&red->ysub);CHKERRQ(ierr);
374     ierr = VecGetLocalSize(red->ysub,&mloc_sub);CHKERRQ(ierr);
375     ierr = VecCreateMPIWithArray(red->subcomm,mloc_sub,m,PETSC_NULL,&red->xsub);CHKERRQ(ierr);
376 #endif
377 
378     /* create xdup and ydup. ydup has empty local arrays because ysub's arrays will be place into it.
379        Note: we use communicator dupcomm, not pc->comm! */
380     ierr = VecCreateMPI(red->dupcomm,mloc_sub,PETSC_DECIDE,&red->xdup);CHKERRQ(ierr);
381     ierr = VecCreateMPIWithArray(red->dupcomm,mloc_sub,PETSC_DECIDE,PETSC_NULL,&red->ydup);CHKERRQ(ierr);
382 
383     /* create vec scatters */
384     if (!red->scatterin){
385       IS       is1,is2;
386       PetscInt *idx1,*idx2,i,j,k;
387       ierr = PetscMalloc(2*red->nsubcomm*mlocal*sizeof(PetscInt),&idx1);CHKERRQ(ierr);
388       idx2 = idx1 + red->nsubcomm*mlocal;
389       j = 0;
390       for (k=0; k<red->nsubcomm; k++){
391         for (i=mstart; i<mend; i++){
392           idx1[j]   = i;
393           idx2[j++] = i + m*k;
394         }
395       }
396       ierr = ISCreateGeneral(comm,red->nsubcomm*mlocal,idx1,&is1);CHKERRQ(ierr);
397       ierr = ISCreateGeneral(comm,red->nsubcomm*mlocal,idx2,&is2);CHKERRQ(ierr);
398       ierr = VecScatterCreate(vec,is1,red->xdup,is2,&red->scatterin);CHKERRQ(ierr);
399       ierr = ISDestroy(is1);CHKERRQ(ierr);
400       ierr = ISDestroy(is2);CHKERRQ(ierr);
401 
402       ierr = ISCreateStride(comm,mlocal,mstart+ red->color*m,1,&is1);CHKERRQ(ierr);
403       ierr = ISCreateStride(comm,mlocal,mstart,1,&is2);CHKERRQ(ierr);
404       ierr = VecScatterCreate(red->xdup,is1,vec,is2,&red->scatterout);CHKERRQ(ierr);
405       ierr = ISDestroy(is1);CHKERRQ(ierr);
406       ierr = ISDestroy(is2);CHKERRQ(ierr);
407       ierr = PetscFree(idx1);CHKERRQ(ierr);
408     }
409   }
410   ierr = VecDestroy(vec);CHKERRQ(ierr);
411 
412   /* if pmatrix set by user is sequential then we do not need to gather the parallel matrix */
413   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
414   if (size == 1) {
415     red->useparallelmat = PETSC_FALSE;
416   }
417 
418   if (red->useparallelmat) {
419     if (pc->setupcalled == 1 && pc->flag == DIFFERENT_NONZERO_PATTERN) {
420       /* destroy old matrices */
421       if (red->pmats) {
422         ierr = MatDestroyMatrices(1,&red->pmats);CHKERRQ(ierr);
423       }
424     } else if (pc->setupcalled == 1) {
425       reuse = MAT_REUSE_MATRIX;
426       str   = SAME_NONZERO_PATTERN;
427     }
428 
429     /* ================== matrix ============= */
430     ierr = VecGetLocalSize(red->ysub,&mlocal_sub);CHKERRQ(ierr);
431     ierr = MatGetRedundantMatrix_AIJ(pc,pc->pmat,red->subcomm,mlocal_sub,&Aredundant);CHKERRQ(ierr);
432 
433     /* grab the parallel matrix and put it into processors of a subcomminicator */
434     ierr = ISCreateStride(PETSC_COMM_SELF,m,0,1,&isl);CHKERRQ(ierr);
435     ierr = MatGetSubMatrices(pc->pmat,1,&isl,&isl,reuse,&red->pmats);CHKERRQ(ierr);
436     ierr = ISDestroy(isl);CHKERRQ(ierr);
437 
438     /* ------- temporarily set a mpi matrix pmats_sub- provided by user! --*/
439     ierr = MatCreate(red->subcomm,&red->pmats_sub);CHKERRQ(ierr);
440     ierr = MatSetSizes(red->pmats_sub,mlocal_sub,mlocal_sub,PETSC_DECIDE,PETSC_DECIDE);CHKERRQ(ierr);
441     ierr = MatSetFromOptions(red->pmats_sub);CHKERRQ(ierr);
442     {
443       PetscInt          Istart,Iend,ncols,i;
444       const PetscInt    *cols;
445       const PetscScalar *vals;
446       PetscTruth flg;
447       ierr = MatGetOwnershipRange(red->pmats_sub,&Istart,&Iend);CHKERRQ(ierr);
448       for (i=Istart; i<Iend; i++) {
449         ierr = MatGetRow(red->pmats[0],i,&ncols,&cols,&vals);CHKERRQ(ierr);
450         ierr = MatSetValues(red->pmats_sub,1,&i,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
451         ierr = MatRestoreRow(red->pmats[0],i,&ncols,&cols,&vals);CHKERRQ(ierr);
452       }
453       ierr = MatAssemblyBegin(red->pmats_sub,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
454       ierr = MatAssemblyEnd(red->pmats_sub,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
455 
456       ierr = MatEqual(red->pmats_sub,Aredundant,&flg);CHKERRQ(ierr);
457       if (!flg) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"pmats_sub !=Aredundant ");
458       ierr = MatDestroy(red->pmats_sub);
459     }
460     red->pmats_sub = Aredundant;
461 
462     /* tell PC of the subcommunicator its operators */
463     ierr = PCSetOperators(red->pc,red->pmats_sub,red->pmats_sub,str);CHKERRQ(ierr);
464   } else {
465     ierr = PCSetOperators(red->pc,pc->mat,pc->pmat,pc->flag);CHKERRQ(ierr);
466   }
467   ierr = PCSetFromOptions(red->pc);CHKERRQ(ierr);
468   ierr = PCSetUp(red->pc);CHKERRQ(ierr);
469   PetscFunctionReturn(0);
470 }
471 
472 
473 #undef __FUNCT__
474 #define __FUNCT__ "PCApply_Redundant"
475 static PetscErrorCode PCApply_Redundant(PC pc,Vec x,Vec y)
476 {
477   PC_Redundant   *red = (PC_Redundant*)pc->data;
478   PetscErrorCode ierr;
479   PetscScalar    *array;
480 
481   PetscFunctionBegin;
482   /* scatter x to xdup */
483   ierr = VecScatterBegin(x,red->xdup,INSERT_VALUES,SCATTER_FORWARD,red->scatterin);CHKERRQ(ierr);
484   ierr = VecScatterEnd(x,red->xdup,INSERT_VALUES,SCATTER_FORWARD,red->scatterin);CHKERRQ(ierr);
485 
486   /* place xdup's local array into xsub */
487   ierr = VecGetArray(red->xdup,&array);CHKERRQ(ierr);
488   ierr = VecPlaceArray(red->xsub,(const PetscScalar*)array);CHKERRQ(ierr);
489 
490   /* apply preconditioner on each processor */
491   ierr = PCApply(red->pc,red->xsub,red->ysub);CHKERRQ(ierr);
492   ierr = VecResetArray(red->xsub);CHKERRQ(ierr);
493   ierr = VecRestoreArray(red->xdup,&array);CHKERRQ(ierr);
494 
495   /* place ysub's local array into ydup */
496   ierr = VecGetArray(red->ysub,&array);CHKERRQ(ierr);
497   ierr = VecPlaceArray(red->ydup,(const PetscScalar*)array);CHKERRQ(ierr);
498 
499   /* scatter ydup to y */
500   ierr = VecScatterBegin(red->ydup,y,INSERT_VALUES,SCATTER_FORWARD,red->scatterout);CHKERRQ(ierr);
501   ierr = VecScatterEnd(red->ydup,y,INSERT_VALUES,SCATTER_FORWARD,red->scatterout);CHKERRQ(ierr);
502   ierr = VecResetArray(red->ydup);CHKERRQ(ierr);
503   ierr = VecRestoreArray(red->ysub,&array);CHKERRQ(ierr);
504   PetscFunctionReturn(0);
505 }
506 
507 #undef __FUNCT__
508 #define __FUNCT__ "PCDestroy_Redundant"
509 static PetscErrorCode PCDestroy_Redundant(PC pc)
510 {
511   PC_Redundant   *red = (PC_Redundant*)pc->data;
512   PetscErrorCode ierr;
513 
514   PetscFunctionBegin;
515   if (red->scatterin)  {ierr = VecScatterDestroy(red->scatterin);CHKERRQ(ierr);}
516   if (red->scatterout) {ierr = VecScatterDestroy(red->scatterout);CHKERRQ(ierr);}
517   if (red->ysub)       {ierr = VecDestroy(red->ysub);CHKERRQ(ierr);}
518   if (red->xsub)       {ierr = VecDestroy(red->xsub);CHKERRQ(ierr);}
519   if (red->xdup)       {ierr = VecDestroy(red->xdup);CHKERRQ(ierr);}
520   if (red->ydup)       {ierr = VecDestroy(red->ydup);CHKERRQ(ierr);}
521   if (red->pmats_sub) {
522     ierr = MatDestroy(red->pmats_sub);CHKERRQ(ierr);
523   }
524 
525   if (red->pmats) {
526     ierr = MatDestroyMatrices(1,&red->pmats);CHKERRQ(ierr);
527   }
528   ierr = PCDestroy(red->pc);CHKERRQ(ierr);
529   ierr = PetscFree(red);CHKERRQ(ierr);
530   PetscFunctionReturn(0);
531 }
532 
533 #undef __FUNCT__
534 #define __FUNCT__ "PCSetFromOptions_Redundant"
535 static PetscErrorCode PCSetFromOptions_Redundant(PC pc)
536 {
537   PetscFunctionBegin;
538   PetscFunctionReturn(0);
539 }
540 
541 EXTERN_C_BEGIN
542 #undef __FUNCT__
543 #define __FUNCT__ "PCRedundantSetScatter_Redundant"
544 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantSetScatter_Redundant(PC pc,VecScatter in,VecScatter out)
545 {
546   PC_Redundant   *red = (PC_Redundant*)pc->data;
547   PetscErrorCode ierr;
548 
549   PetscFunctionBegin;
550   red->scatterin  = in;
551   red->scatterout = out;
552   ierr = PetscObjectReference((PetscObject)in);CHKERRQ(ierr);
553   ierr = PetscObjectReference((PetscObject)out);CHKERRQ(ierr);
554   PetscFunctionReturn(0);
555 }
556 EXTERN_C_END
557 
558 #undef __FUNCT__
559 #define __FUNCT__ "PCRedundantSetScatter"
560 /*@
561    PCRedundantSetScatter - Sets the scatter used to copy values into the
562      redundant local solve and the scatter to move them back into the global
563      vector.
564 
565    Collective on PC
566 
567    Input Parameters:
568 +  pc - the preconditioner context
569 .  in - the scatter to move the values in
570 -  out - the scatter to move them out
571 
572    Level: advanced
573 
574 .keywords: PC, redundant solve
575 @*/
576 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantSetScatter(PC pc,VecScatter in,VecScatter out)
577 {
578   PetscErrorCode ierr,(*f)(PC,VecScatter,VecScatter);
579 
580   PetscFunctionBegin;
581   PetscValidHeaderSpecific(pc,PC_COOKIE,1);
582   PetscValidHeaderSpecific(in,VEC_SCATTER_COOKIE,2);
583   PetscValidHeaderSpecific(out,VEC_SCATTER_COOKIE,3);
584   ierr = PetscObjectQueryFunction((PetscObject)pc,"PCRedundantSetScatter_C",(void (**)(void))&f);CHKERRQ(ierr);
585   if (f) {
586     ierr = (*f)(pc,in,out);CHKERRQ(ierr);
587   }
588   PetscFunctionReturn(0);
589 }
590 
591 EXTERN_C_BEGIN
592 #undef __FUNCT__
593 #define __FUNCT__ "PCRedundantGetPC_Redundant"
594 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetPC_Redundant(PC pc,PC *innerpc)
595 {
596   PC_Redundant *red = (PC_Redundant*)pc->data;
597 
598   PetscFunctionBegin;
599   *innerpc = red->pc;
600   PetscFunctionReturn(0);
601 }
602 EXTERN_C_END
603 
604 #undef __FUNCT__
605 #define __FUNCT__ "PCRedundantGetPC"
606 /*@
607    PCRedundantGetPC - Gets the sequential PC created by the redundant PC.
608 
609    Not Collective
610 
611    Input Parameter:
612 .  pc - the preconditioner context
613 
614    Output Parameter:
615 .  innerpc - the sequential PC
616 
617    Level: advanced
618 
619 .keywords: PC, redundant solve
620 @*/
621 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetPC(PC pc,PC *innerpc)
622 {
623   PetscErrorCode ierr,(*f)(PC,PC*);
624 
625   PetscFunctionBegin;
626   PetscValidHeaderSpecific(pc,PC_COOKIE,1);
627   PetscValidPointer(innerpc,2);
628   ierr = PetscObjectQueryFunction((PetscObject)pc,"PCRedundantGetPC_C",(void (**)(void))&f);CHKERRQ(ierr);
629   if (f) {
630     ierr = (*f)(pc,innerpc);CHKERRQ(ierr);
631   }
632   PetscFunctionReturn(0);
633 }
634 
635 EXTERN_C_BEGIN
636 #undef __FUNCT__
637 #define __FUNCT__ "PCRedundantGetOperators_Redundant"
638 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetOperators_Redundant(PC pc,Mat *mat,Mat *pmat)
639 {
640   PC_Redundant *red = (PC_Redundant*)pc->data;
641 
642   PetscFunctionBegin;
643   if (mat)  *mat  = red->pmats[0];
644   if (pmat) *pmat = red->pmats[0];
645   PetscFunctionReturn(0);
646 }
647 EXTERN_C_END
648 
649 #undef __FUNCT__
650 #define __FUNCT__ "PCRedundantGetOperators"
651 /*@
652    PCRedundantGetOperators - gets the sequential matrix and preconditioner matrix
653 
654    Not Collective
655 
656    Input Parameter:
657 .  pc - the preconditioner context
658 
659    Output Parameters:
660 +  mat - the matrix
661 -  pmat - the (possibly different) preconditioner matrix
662 
663    Level: advanced
664 
665 .keywords: PC, redundant solve
666 @*/
667 PetscErrorCode PETSCKSP_DLLEXPORT PCRedundantGetOperators(PC pc,Mat *mat,Mat *pmat)
668 {
669   PetscErrorCode ierr,(*f)(PC,Mat*,Mat*);
670 
671   PetscFunctionBegin;
672   PetscValidHeaderSpecific(pc,PC_COOKIE,1);
673   if (mat)  PetscValidPointer(mat,2);
674   if (pmat) PetscValidPointer(pmat,3);
675   ierr = PetscObjectQueryFunction((PetscObject)pc,"PCRedundantGetOperators_C",(void (**)(void))&f);CHKERRQ(ierr);
676   if (f) {
677     ierr = (*f)(pc,mat,pmat);CHKERRQ(ierr);
678   }
679   PetscFunctionReturn(0);
680 }
681 
682 /* -------------------------------------------------------------------------------------*/
683 /*MC
684      PCREDUNDANT - Runs a preconditioner for the entire problem on each processor
685 
686 
687      Options for the redundant preconditioners can be set with -redundant_pc_xxx
688 
689    Level: intermediate
690 
691 
692 .seealso:  PCCreate(), PCSetType(), PCType (for list of available types), PCRedundantSetScatter(),
693            PCRedundantGetPC(), PCRedundantGetOperators()
694 
695 M*/
696 
697 EXTERN_C_BEGIN
698 #undef __FUNCT__
699 #define __FUNCT__ "PCCreate_Redundant"
700 PetscErrorCode PETSCKSP_DLLEXPORT PCCreate_Redundant(PC pc)
701 {
702   PetscErrorCode ierr;
703   PC_Redundant   *red;
704   const char     *prefix;
705 
706   PetscInt       nsubcomm,np_subcomm,nleftover,i,j,color;
707   PetscMPIInt    rank,size,subrank,*subsize;
708   MPI_Comm       subcomm;
709   PetscMPIInt    duprank;
710   PetscMPIInt    rank_dup,size_dup;
711   MPI_Comm       dupcomm;
712 
713   PetscFunctionBegin;
714   ierr = PetscNew(PC_Redundant,&red);CHKERRQ(ierr);
715   ierr = PetscLogObjectMemory(pc,sizeof(PC_Redundant));CHKERRQ(ierr);
716   red->useparallelmat   = PETSC_TRUE;
717 
718   ierr = MPI_Comm_rank(pc->comm,&rank);CHKERRQ(ierr);
719   ierr = MPI_Comm_size(pc->comm,&size);CHKERRQ(ierr);
720   nsubcomm = size;
721   ierr = PetscOptionsGetInt(PETSC_NULL,"-nsubcomm",&nsubcomm,PETSC_NULL);CHKERRQ(ierr);
722   if (nsubcomm > size) SETERRQ2(PETSC_ERR_ARG_OUTOFRANGE, "Num of subcommunicators %D cannot be larger than MPI_Comm_size %D",nsubcomm,size);
723 
724   /* get size of each subcommunicators */
725   ierr = PetscMalloc((1+nsubcomm)*sizeof(PetscMPIInt),&subsize);CHKERRQ(ierr);
726   np_subcomm = size/nsubcomm;
727   nleftover  = size - nsubcomm*np_subcomm;
728   for (i=0; i<nsubcomm; i++){
729     subsize[i] = np_subcomm;
730     if (i<nleftover) subsize[i]++;
731   }
732 
733   /* find color for this proc */
734 #ifdef INTER_COLOR
735   color   = rank%nsubcomm;
736   subrank = rank/nsubcomm;
737 #endif
738 
739 #ifdef CONTIGUOUS_COLOR
740   color = 0; subrank = 0; i = 0; j=0;
741   while (i < size){
742     if (rank == i) break; /* my color is found */
743     if (j >= subsize[color]-1){ /* next subcomm */
744       j = -1; subrank = -1; color++;
745     }
746     i++; j++; subrank++;
747   }
748 #endif
749 
750   ierr = MPI_Comm_split(pc->comm,color,subrank,&subcomm);CHKERRQ(ierr);
751   red->subcomm  = subcomm;
752   red->color    = color;
753   red->nsubcomm = nsubcomm;
754 
755   j = 0; duprank = 0;
756   for (i=0; i<nsubcomm; i++){
757     if (j == color){
758       duprank += subrank;
759       break;
760     }
761     duprank += subsize[i]; j++;
762   }
763   /*
764   ierr = PetscSynchronizedPrintf(pc->comm, "[%d] color %d, subrank %d, duprank %d\n",rank,color,subrank,duprank);
765   ierr = PetscSynchronizedFlush(pc->comm);CHKERRQ(ierr);
766   */
767 
768   ierr = MPI_Comm_split(pc->comm,0,duprank,&dupcomm);CHKERRQ(ierr);
769   ierr = MPI_Comm_rank(dupcomm,&rank_dup);CHKERRQ(ierr);
770   ierr = MPI_Comm_size(dupcomm,&size_dup);CHKERRQ(ierr);
771   /*
772   ierr = PetscSynchronizedPrintf(pc->comm, "[%d] duprank %d\n",rank,duprank);
773   ierr = PetscSynchronizedFlush(pc->comm);CHKERRQ(ierr);
774   */
775   red->dupcomm = dupcomm;
776   ierr = PetscFree(subsize);CHKERRQ(ierr);
777 
778   /* create the sequential PC that each processor has copy of */
779   ierr = PCCreate(subcomm,&red->pc);CHKERRQ(ierr);
780   ierr = PCSetType(red->pc,PCLU);CHKERRQ(ierr);
781   ierr = PCGetOptionsPrefix(pc,&prefix);CHKERRQ(ierr);
782   ierr = PCSetOptionsPrefix(red->pc,prefix);CHKERRQ(ierr);
783   ierr = PCAppendOptionsPrefix(red->pc,"redundant_");CHKERRQ(ierr);
784 
785   pc->ops->apply             = PCApply_Redundant;
786   pc->ops->applytranspose    = 0;
787   pc->ops->setup             = PCSetUp_Redundant;
788   pc->ops->destroy           = PCDestroy_Redundant;
789   pc->ops->setfromoptions    = PCSetFromOptions_Redundant;
790   pc->ops->setuponblocks     = 0;
791   pc->ops->view              = PCView_Redundant;
792   pc->ops->applyrichardson   = 0;
793 
794   pc->data     = (void*)red;
795 
796   ierr = PetscObjectComposeFunctionDynamic((PetscObject)pc,"PCRedundantSetScatter_C","PCRedundantSetScatter_Redundant",
797                     PCRedundantSetScatter_Redundant);CHKERRQ(ierr);
798   ierr = PetscObjectComposeFunctionDynamic((PetscObject)pc,"PCRedundantGetPC_C","PCRedundantGetPC_Redundant",
799                     PCRedundantGetPC_Redundant);CHKERRQ(ierr);
800   ierr = PetscObjectComposeFunctionDynamic((PetscObject)pc,"PCRedundantGetOperators_C","PCRedundantGetOperators_Redundant",
801                     PCRedundantGetOperators_Redundant);CHKERRQ(ierr);
802 
803   PetscFunctionReturn(0);
804 }
805 EXTERN_C_END
806