xref: /petsc/src/ksp/pc/impls/is/nn/nn.c (revision 9c30b7d2697335155d7490a7e085415ee7b4a02a)
1 
2 #include "src/ksp/pc/impls/is/nn/nn.h"
3 
4 /* -------------------------------------------------------------------------- */
5 /*
6    PCSetUp_NN - Prepares for the use of the NN preconditioner
7                     by setting data structures and options.
8 
9    Input Parameter:
10 .  pc - the preconditioner context
11 
12    Application Interface Routine: PCSetUp()
13 
14    Notes:
15    The interface routine PCSetUp() is not usually called directly by
16    the user, but instead is called by PCApply() if necessary.
17 */
18 #undef __FUNCT__
19 #define __FUNCT__ "PCSetUp_NN"
20 static int PCSetUp_NN(PC pc)
21 {
22   int ierr;
23 
24   PetscFunctionBegin;
25   if (!pc->setupcalled) {
26     /* Set up all the "iterative substructuring" common block */
27     ierr = PCISSetUp(pc);CHKERRQ(ierr);
28     /* Create the coarse matrix. */
29     ierr = PCNNCreateCoarseMatrix(pc);CHKERRQ(ierr);
30   }
31   PetscFunctionReturn(0);
32 }
33 
34 /* -------------------------------------------------------------------------- */
35 /*
36    PCApply_NN - Applies the NN preconditioner to a vector.
37 
38    Input Parameters:
39 .  pc - the preconditioner context
40 .  r - input vector (global)
41 
42    Output Parameter:
43 .  z - output vector (global)
44 
45    Application Interface Routine: PCApply()
46  */
47 #undef __FUNCT__
48 #define __FUNCT__ "PCApply_NN"
49 static int PCApply_NN(PC pc,Vec r,Vec z)
50 {
51   PC_IS       *pcis = (PC_IS*)(pc->data);
52   int         ierr;
53   PetscScalar m_one = -1.0;
54   Vec         w = pcis->vec1_global;
55 
56   PetscFunctionBegin;
57 
58   /*
59     Dirichlet solvers.
60     Solving $ B_I^{(i)}r_I^{(i)} $ at each processor.
61     Storing the local results at vec2_D
62   */
63   ierr = VecScatterBegin(r,pcis->vec1_D,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_D);CHKERRQ(ierr);
64   ierr = VecScatterEnd  (r,pcis->vec1_D,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_D);CHKERRQ(ierr);
65   ierr = KSPSolve(pcis->ksp_D,pcis->vec1_D,pcis->vec2_D);CHKERRQ(ierr);
66 
67   /*
68     Computing $ r_B - \sum_j \tilde R_j^T A_{BI}^{(j)} (B_I^{(j)}r_I^{(j)}) $ .
69     Storing the result in the interface portion of the global vector w.
70   */
71   ierr = MatMult(pcis->A_BI,pcis->vec2_D,pcis->vec1_B);CHKERRQ(ierr);
72   ierr = VecScale(&m_one,pcis->vec1_B);CHKERRQ(ierr);
73   ierr = VecCopy(r,w);CHKERRQ(ierr);
74   ierr = VecScatterBegin(pcis->vec1_B,w,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
75   ierr = VecScatterEnd  (pcis->vec1_B,w,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
76 
77   /*
78     Apply the interface preconditioner
79   */
80   ierr = PCNNApplyInterfacePreconditioner(pc,w,z,pcis->work_N,pcis->vec1_B,pcis->vec2_B,pcis->vec3_B,pcis->vec1_D,
81                                           pcis->vec3_D,pcis->vec1_N,pcis->vec2_N);CHKERRQ(ierr);
82 
83   /*
84     Computing $ t_I^{(i)} = A_{IB}^{(i)} \tilde R_i z_B $
85     The result is stored in vec1_D.
86   */
87   ierr = VecScatterBegin(z,pcis->vec1_B,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_B);CHKERRQ(ierr);
88   ierr = VecScatterEnd  (z,pcis->vec1_B,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_B);CHKERRQ(ierr);
89   ierr = MatMult(pcis->A_IB,pcis->vec1_B,pcis->vec1_D);CHKERRQ(ierr);
90 
91   /*
92     Dirichlet solvers.
93     Computing $ B_I^{(i)}t_I^{(i)} $ and sticking into the global vector the blocks
94     $ B_I^{(i)}r_I^{(i)} - B_I^{(i)}t_I^{(i)} $.
95   */
96   ierr = VecScatterBegin(pcis->vec2_D,z,INSERT_VALUES,SCATTER_REVERSE,pcis->global_to_D);CHKERRQ(ierr);
97   ierr = VecScatterEnd  (pcis->vec2_D,z,INSERT_VALUES,SCATTER_REVERSE,pcis->global_to_D);CHKERRQ(ierr);
98   ierr = KSPSolve(pcis->ksp_D,pcis->vec1_D,pcis->vec2_D);CHKERRQ(ierr);
99   ierr = VecScale(&m_one,pcis->vec2_D);CHKERRQ(ierr);
100   ierr = VecScatterBegin(pcis->vec2_D,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_D);CHKERRQ(ierr);
101   ierr = VecScatterEnd  (pcis->vec2_D,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_D);CHKERRQ(ierr);
102 
103   PetscFunctionReturn(0);
104 }
105 
106 /* -------------------------------------------------------------------------- */
107 /*
108    PCDestroy_NN - Destroys the private context for the NN preconditioner
109    that was created with PCCreate_NN().
110 
111    Input Parameter:
112 .  pc - the preconditioner context
113 
114    Application Interface Routine: PCDestroy()
115 */
116 #undef __FUNCT__
117 #define __FUNCT__ "PCDestroy_NN"
118 static int PCDestroy_NN(PC pc)
119 {
120   PC_NN *pcnn = (PC_NN*)pc->data;
121   int   ierr;
122 
123   PetscFunctionBegin;
124 
125   ierr = PCISDestroy(pc);CHKERRQ(ierr);
126 
127   if (pcnn->coarse_mat)  {ierr = MatDestroy(pcnn->coarse_mat);CHKERRQ(ierr);}
128   if (pcnn->coarse_x)    {ierr = VecDestroy(pcnn->coarse_x);CHKERRQ(ierr);}
129   if (pcnn->coarse_b)    {ierr = VecDestroy(pcnn->coarse_b);CHKERRQ(ierr);}
130   if (pcnn->ksp_coarse) {ierr = KSPDestroy(pcnn->ksp_coarse);CHKERRQ(ierr);}
131   if (pcnn->DZ_IN) {
132     if (pcnn->DZ_IN[0]) {ierr = PetscFree(pcnn->DZ_IN[0]);CHKERRQ(ierr);}
133     ierr = PetscFree(pcnn->DZ_IN);CHKERRQ(ierr);
134   }
135 
136   /*
137       Free the private data structure that was hanging off the PC
138   */
139   ierr = PetscFree(pcnn);CHKERRQ(ierr);
140   PetscFunctionReturn(0);
141 }
142 
143 /* -------------------------------------------------------------------------- */
144 /*MC
145    PCNN - Balancing Neumann-Neumann for scalar elliptic PDEs.
146 
147    Options Database Keys:
148 +    -pc_nn_turn_off_first_balancing - do not balance the residual before solving the local Neumann problems
149                                        (this skips the first coarse grid solve in the preconditioner)
150 .    -pc_nn_turn_off_second_balancing - do not balance the solution solving the local Neumann problems
151                                        (this skips the second coarse grid solve in the preconditioner)
152 .    -pc_is_damp_fixed <fact> -
153 .    -pc_is_remove_nullspace_fixed -
154 .    -pc_is_set_damping_factor_floating <fact> -
155 .    -pc_is_not_damp_floating -
156 +    -pc_is_not_remove_nullspace_floating -
157 
158    Level: intermediate
159 
160    Notes: The matrix used with this preconditioner must be of type MATIS
161 
162           Unlike more 'conventional' Neumann-Neumann preconditioners this iterates over ALL the
163           degrees of freedom, NOT just those on the interface (this allows the use of approximate solvers
164           on the subdomains; though in our experience using approximate solvers is slower.).
165 
166           Options for the coarse grid preconditioner can be set with -nn_coarse_pc_xxx
167           Options for the Dirichlet subproblem preconditioner can be set with -is_localD_pc_xxx
168           Options for the Neumann subproblem preconditioner can be set with -is_localN_pc_xxx
169 
170    Contributed by Paulo Goldfeld
171 
172 .seealso:  PCCreate(), PCSetType(), PCType (for list of available types), PC,  MatIS
173 M*/
174 EXTERN_C_BEGIN
175 #undef __FUNCT__
176 #define __FUNCT__ "PCCreate_NN"
177 int PCCreate_NN(PC pc)
178 {
179   int   ierr;
180   PC_NN *pcnn;
181 
182   PetscFunctionBegin;
183 
184   /*
185      Creates the private data structure for this preconditioner and
186      attach it to the PC object.
187   */
188   ierr      = PetscNew(PC_NN,&pcnn);CHKERRQ(ierr);
189   pc->data  = (void*)pcnn;
190 
191   /*
192      Logs the memory usage; this is not needed but allows PETSc to
193      monitor how much memory is being used for various purposes.
194   */
195   PetscLogObjectMemory(pc,sizeof(PC_NN)+sizeof(PC_IS)); /* Is this the right thing to do? */
196 
197   ierr = PCISCreate(pc);CHKERRQ(ierr);
198   pcnn->coarse_mat  = 0;
199   pcnn->coarse_x    = 0;
200   pcnn->coarse_b    = 0;
201   pcnn->ksp_coarse = 0;
202   pcnn->DZ_IN       = 0;
203 
204   /*
205       Set the pointers for the functions that are provided above.
206       Now when the user-level routines (such as PCApply(), PCDestroy(), etc.)
207       are called, they will automatically call these functions.  Note we
208       choose not to provide a couple of these functions since they are
209       not needed.
210   */
211   pc->ops->apply               = PCApply_NN;
212   pc->ops->applytranspose      = 0;
213   pc->ops->setup               = PCSetUp_NN;
214   pc->ops->destroy             = PCDestroy_NN;
215   pc->ops->view                = 0;
216   pc->ops->applyrichardson     = 0;
217   pc->ops->applysymmetricleft  = 0;
218   pc->ops->applysymmetricright = 0;
219 
220   PetscFunctionReturn(0);
221 }
222 EXTERN_C_END
223 
224 
225 /* -------------------------------------------------------------------------- */
226 /*
227    PCNNCreateCoarseMatrix -
228 */
229 #undef __FUNCT__
230 #define __FUNCT__ "PCNNCreateCoarseMatrix"
231 int PCNNCreateCoarseMatrix (PC pc)
232 {
233   MPI_Request    *send_request, *recv_request;
234   int            i, j, k, ierr;
235 
236   PetscScalar*   mat;    /* Sub-matrix with this subdomain's contribution to the coarse matrix             */
237   PetscScalar**  DZ_OUT; /* proc[k].DZ_OUT[i][] = bit of vector to be sent from processor k to processor i */
238 
239   /* aliasing some names */
240   PC_IS*         pcis     = (PC_IS*)(pc->data);
241   PC_NN*         pcnn     = (PC_NN*)pc->data;
242   int            n_neigh  = pcis->n_neigh;
243   int*           neigh    = pcis->neigh;
244   int*           n_shared = pcis->n_shared;
245   int**          shared   = pcis->shared;
246   PetscScalar**  DZ_IN;   /* Must be initialized after memory allocation. */
247 
248   PetscFunctionBegin;
249 
250   /* Allocate memory for mat (the +1 is to handle the case n_neigh equal to zero) */
251   ierr = PetscMalloc((n_neigh*n_neigh+1)*sizeof(PetscScalar),&mat);CHKERRQ(ierr);
252 
253   /* Allocate memory for DZ */
254   /* Notice that DZ_OUT[0] is allocated some space that is never used. */
255   /* This is just in order to DZ_OUT and DZ_IN to have exactly the same form. */
256   {
257     int size_of_Z = 0;
258     ierr  = PetscMalloc ((n_neigh+1)*sizeof(PetscScalar*),&pcnn->DZ_IN);CHKERRQ(ierr);
259     DZ_IN = pcnn->DZ_IN;
260     ierr  = PetscMalloc ((n_neigh+1)*sizeof(PetscScalar*),&DZ_OUT);CHKERRQ(ierr);
261     for (i=0; i<n_neigh; i++) {
262       size_of_Z += n_shared[i];
263     }
264     ierr = PetscMalloc ((size_of_Z+1)*sizeof(PetscScalar),&DZ_IN[0]);CHKERRQ(ierr);
265     ierr = PetscMalloc ((size_of_Z+1)*sizeof(PetscScalar),&DZ_OUT[0]);CHKERRQ(ierr);
266   }
267   for (i=1; i<n_neigh; i++) {
268     DZ_IN[i]  = DZ_IN [i-1] + n_shared[i-1];
269     DZ_OUT[i] = DZ_OUT[i-1] + n_shared[i-1];
270   }
271 
272   /* Set the values of DZ_OUT, in order to send this info to the neighbours */
273   /* First, set the auxiliary array pcis->work_N. */
274   ierr = PCISScatterArrayNToVecB(pcis->work_N,pcis->D,INSERT_VALUES,SCATTER_REVERSE,pc);CHKERRQ(ierr);
275   for (i=1; i<n_neigh; i++){
276     for (j=0; j<n_shared[i]; j++) {
277       DZ_OUT[i][j] = pcis->work_N[shared[i][j]];
278     }
279   }
280 
281   /* Non-blocking send/receive the common-interface chunks of scaled nullspaces */
282   /* Notice that send_request[] and recv_request[] could have one less element. */
283   /* We make them longer to have request[i] corresponding to neigh[i].          */
284   {
285     int tag;
286     ierr = PetscObjectGetNewTag((PetscObject)pc,&tag);CHKERRQ(ierr);
287     ierr = PetscMalloc((2*(n_neigh)+1)*sizeof(MPI_Request),&send_request);CHKERRQ(ierr);
288     recv_request = send_request + (n_neigh);
289     for (i=1; i<n_neigh; i++) {
290       ierr = MPI_Isend((void*)(DZ_OUT[i]),n_shared[i],MPIU_SCALAR,neigh[i],tag,pc->comm,&(send_request[i]));CHKERRQ(ierr);
291       ierr = MPI_Irecv((void*)(DZ_IN [i]),n_shared[i],MPIU_SCALAR,neigh[i],tag,pc->comm,&(recv_request[i]));CHKERRQ(ierr);
292     }
293   }
294 
295   /* Set DZ_IN[0][] (recall that neigh[0]==rank, always) */
296   for(j=0; j<n_shared[0]; j++) {
297     DZ_IN[0][j] = pcis->work_N[shared[0][j]];
298   }
299 
300   /* Start computing with local D*Z while communication goes on.    */
301   /* Apply Schur complement. The result is "stored" in vec (more    */
302   /* precisely, vec points to the result, stored in pc_nn->vec1_B)  */
303   /* and also scattered to pcnn->work_N.                            */
304   ierr = PCNNApplySchurToChunk(pc,n_shared[0],shared[0],DZ_IN[0],pcis->work_N,pcis->vec1_B,
305                                pcis->vec2_B,pcis->vec1_D,pcis->vec2_D);CHKERRQ(ierr);
306 
307   /* Compute the first column, while completing the receiving. */
308   for (i=0; i<n_neigh; i++) {
309     MPI_Status stat;
310     int ind=0;
311     if (i>0) { ierr = MPI_Waitany(n_neigh-1,recv_request+1,&ind,&stat);CHKERRQ(ierr); ind++;}
312     mat[ind*n_neigh+0] = 0.0;
313     for (k=0; k<n_shared[ind]; k++) {
314       mat[ind*n_neigh+0] += DZ_IN[ind][k] * pcis->work_N[shared[ind][k]];
315     }
316   }
317 
318   /* Compute the remaining of the columns */
319   for (j=1; j<n_neigh; j++) {
320     ierr = PCNNApplySchurToChunk(pc,n_shared[j],shared[j],DZ_IN[j],pcis->work_N,pcis->vec1_B,
321                                  pcis->vec2_B,pcis->vec1_D,pcis->vec2_D);CHKERRQ(ierr);
322     for (i=0; i<n_neigh; i++) {
323       mat[i*n_neigh+j] = 0.0;
324       for (k=0; k<n_shared[i]; k++) {
325 	mat[i*n_neigh+j] += DZ_IN[i][k] * pcis->work_N[shared[i][k]];
326       }
327     }
328   }
329 
330   /* Complete the sending. */
331   if (n_neigh>1) {
332     MPI_Status *stat;
333     ierr = PetscMalloc((n_neigh-1)*sizeof(MPI_Status),&stat);CHKERRQ(ierr);
334     ierr = MPI_Waitall(n_neigh-1,&(send_request[1]),stat);CHKERRQ(ierr);
335     ierr = PetscFree(stat);CHKERRQ(ierr);
336   }
337 
338   /* Free the memory for the MPI requests */
339   ierr = PetscFree(send_request);CHKERRQ(ierr);
340 
341   /* Free the memory for DZ_OUT */
342   if (DZ_OUT) {
343     if (DZ_OUT[0]) { ierr = PetscFree(DZ_OUT[0]);CHKERRQ(ierr); }
344     ierr = PetscFree(DZ_OUT);CHKERRQ(ierr);
345   }
346 
347   {
348     int size,n_neigh_m1;
349     ierr = MPI_Comm_size(pc->comm,&size);CHKERRQ(ierr);
350     n_neigh_m1 = (n_neigh) ? n_neigh-1 : 0;
351     /* Create the global coarse vectors (rhs and solution). */
352     ierr = VecCreateMPI(pc->comm,1,size,&(pcnn->coarse_b));CHKERRQ(ierr);
353     ierr = VecDuplicate(pcnn->coarse_b,&(pcnn->coarse_x));CHKERRQ(ierr);
354     /* Create and set the global coarse AIJ matrix. */
355     ierr = MatCreate(pc->comm,1,1,size,size,&(pcnn->coarse_mat));CHKERRQ(ierr);
356     ierr = MatSetType(pcnn->coarse_mat,MATAIJ);CHKERRQ(ierr);
357     ierr = MatSeqAIJSetPreallocation(pcnn->coarse_mat,1,PETSC_NULL);CHKERRQ(ierr);
358     ierr = MatMPIAIJSetPreallocation(pcnn->coarse_mat,1,PETSC_NULL,1,PETSC_NULL);CHKERRQ(ierr);
359     ierr = MatSetValues(pcnn->coarse_mat,n_neigh,neigh,n_neigh,neigh,mat,ADD_VALUES);CHKERRQ(ierr);
360     ierr = MatAssemblyBegin(pcnn->coarse_mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
361     ierr = MatAssemblyEnd  (pcnn->coarse_mat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
362   }
363 
364   {
365     int         rank;
366     PetscScalar one = 1.0;
367     IS          is;
368     ierr = MPI_Comm_rank(pc->comm,&rank);CHKERRQ(ierr);
369     /* "Zero out" rows of not-purely-Neumann subdomains */
370     if (pcis->pure_neumann) {  /* does NOT zero the row; create an empty index set. The reason is that MatZeroRows() is collective. */
371       ierr = ISCreateStride(pc->comm,0,0,0,&is);CHKERRQ(ierr);
372     } else { /* here it DOES zero the row, since it's not a floating subdomain. */
373       ierr = ISCreateStride(pc->comm,1,rank,0,&is);CHKERRQ(ierr);
374     }
375     ierr = MatZeroRows(pcnn->coarse_mat,is,&one);CHKERRQ(ierr);
376     ierr = ISDestroy(is);CHKERRQ(ierr);
377   }
378 
379   /* Create the coarse linear solver context */
380   {
381     PC  pc_ctx, inner_pc;
382     ierr = KSPCreate(pc->comm,&pcnn->ksp_coarse);CHKERRQ(ierr);
383     ierr = KSPSetOperators(pcnn->ksp_coarse,pcnn->coarse_mat,pcnn->coarse_mat,SAME_PRECONDITIONER);CHKERRQ(ierr);
384     ierr = KSPGetPC(pcnn->ksp_coarse,&pc_ctx);CHKERRQ(ierr);
385     ierr = PCSetType(pc_ctx,PCREDUNDANT);CHKERRQ(ierr);
386     ierr = KSPSetType(pcnn->ksp_coarse,KSPPREONLY);CHKERRQ(ierr);
387     ierr = PCRedundantGetPC(pc_ctx,&inner_pc);CHKERRQ(ierr);
388     ierr = PCSetType(inner_pc,PCLU);CHKERRQ(ierr);
389     ierr = KSPSetOptionsPrefix(pcnn->ksp_coarse,"nn_coarse_");CHKERRQ(ierr);
390     ierr = KSPSetFromOptions(pcnn->ksp_coarse);CHKERRQ(ierr);
391     /* the vectors in the following line are dummy arguments, just telling the KSP the vector size. Values are not used */
392     ierr = KSPSetUp(pcnn->ksp_coarse);CHKERRQ(ierr);
393   }
394 
395   /* Free the memory for mat */
396   ierr = PetscFree(mat);CHKERRQ(ierr);
397 
398   /* for DEBUGGING, save the coarse matrix to a file. */
399   {
400     PetscTruth flg;
401     ierr = PetscOptionsHasName(PETSC_NULL,"-pc_nn_save_coarse_matrix",&flg);CHKERRQ(ierr);
402     if (flg) {
403       PetscViewer viewer;
404       ierr = PetscViewerASCIIOpen(PETSC_COMM_WORLD,"coarse.m",&viewer);CHKERRQ(ierr);
405       ierr = PetscViewerSetFormat(viewer,PETSC_VIEWER_ASCII_MATLAB);CHKERRQ(ierr);
406       ierr = MatView(pcnn->coarse_mat,viewer);CHKERRQ(ierr);
407       ierr = PetscViewerDestroy(viewer);CHKERRQ(ierr);
408     }
409   }
410 
411   /*  Set the variable pcnn->factor_coarse_rhs. */
412   pcnn->factor_coarse_rhs = (pcis->pure_neumann) ? 1.0 : 0.0;
413 
414   /* See historical note 02, at the bottom of this file. */
415 
416   PetscFunctionReturn(0);
417 }
418 
419 /* -------------------------------------------------------------------------- */
420 /*
421    PCNNApplySchurToChunk -
422 
423    Input parameters:
424 .  pcnn
425 .  n - size of chunk
426 .  idx - indices of chunk
427 .  chunk - values
428 
429    Output parameters:
430 .  array_N - result of Schur complement applied to chunk, scattered to big array
431 .  vec1_B  - result of Schur complement applied to chunk
432 .  vec2_B  - garbage (used as work space)
433 .  vec1_D  - garbage (used as work space)
434 .  vec2_D  - garbage (used as work space)
435 
436 */
437 #undef __FUNCT__
438 #define __FUNCT__ "PCNNApplySchurToChunk"
439 int PCNNApplySchurToChunk(PC pc, int n, int* idx, PetscScalar *chunk, PetscScalar* array_N, Vec vec1_B, Vec vec2_B, Vec vec1_D, Vec vec2_D)
440 {
441   int   i, ierr;
442   PC_IS *pcis = (PC_IS*)(pc->data);
443 
444   PetscFunctionBegin;
445 
446   ierr = PetscMemzero((void*)array_N, pcis->n*sizeof(PetscScalar));CHKERRQ(ierr);
447   for (i=0; i<n; i++) { array_N[idx[i]] = chunk[i]; }
448   ierr = PCISScatterArrayNToVecB(array_N,vec2_B,INSERT_VALUES,SCATTER_FORWARD,pc);CHKERRQ(ierr);
449   ierr = PCISApplySchur(pc,vec2_B,vec1_B,(Vec)0,vec1_D,vec2_D);CHKERRQ(ierr);
450   ierr = PCISScatterArrayNToVecB(array_N,vec1_B,INSERT_VALUES,SCATTER_REVERSE,pc);CHKERRQ(ierr);
451 
452   PetscFunctionReturn(0);
453 }
454 
455 /* -------------------------------------------------------------------------- */
456 /*
457    PCNNApplyInterfacePreconditioner - Apply the interface preconditioner, i.e.,
458                                       the preconditioner for the Schur complement.
459 
460    Input parameter:
461 .  r - global vector of interior and interface nodes. The values on the interior nodes are NOT used.
462 
463    Output parameters:
464 .  z - global vector of interior and interface nodes. The values on the interface are the result of
465        the application of the interface preconditioner to the interface part of r. The values on the
466        interior nodes are garbage.
467 .  work_N - array of local nodes (interior and interface, including ghosts); returns garbage (used as work space)
468 .  vec1_B - vector of local interface nodes (including ghosts); returns garbage (used as work space)
469 .  vec2_B - vector of local interface nodes (including ghosts); returns garbage (used as work space)
470 .  vec3_B - vector of local interface nodes (including ghosts); returns garbage (used as work space)
471 .  vec1_D - vector of local interior nodes; returns garbage (used as work space)
472 .  vec2_D - vector of local interior nodes; returns garbage (used as work space)
473 .  vec1_N - vector of local nodes (interior and interface, including ghosts); returns garbage (used as work space)
474 .  vec2_N - vector of local nodes (interior and interface, including ghosts); returns garbage (used as work space)
475 
476 */
477 #undef __FUNCT__
478 #define __FUNCT__ "PCNNApplyInterfacePreconditioner"
479 int PCNNApplyInterfacePreconditioner (PC pc, Vec r, Vec z, PetscScalar* work_N, Vec vec1_B, Vec vec2_B, Vec vec3_B, Vec vec1_D,
480                                       Vec vec2_D, Vec vec1_N, Vec vec2_N)
481 {
482   int    ierr;
483   PC_IS* pcis = (PC_IS*)(pc->data);
484 
485   PetscFunctionBegin;
486 
487   /*
488     First balancing step.
489   */
490   {
491     PetscTruth flg;
492     ierr = PetscOptionsHasName(PETSC_NULL,"-pc_nn_turn_off_first_balancing",&flg);CHKERRQ(ierr);
493     if (!flg) {
494       ierr = PCNNBalancing(pc,r,(Vec)0,z,vec1_B,vec2_B,(Vec)0,vec1_D,vec2_D,work_N);CHKERRQ(ierr);
495     } else {
496       ierr = VecCopy(r,z);CHKERRQ(ierr);
497     }
498   }
499 
500   /*
501     Extract the local interface part of z and scale it by D
502   */
503   ierr = VecScatterBegin(z,vec1_B,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_B);CHKERRQ(ierr);
504   ierr = VecScatterEnd  (z,vec1_B,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_B);CHKERRQ(ierr);
505   ierr = VecPointwiseMult(pcis->D,vec1_B,vec2_B);CHKERRQ(ierr);
506 
507   /* Neumann Solver */
508   ierr = PCISApplyInvSchur(pc,vec2_B,vec1_B,vec1_N,vec2_N);CHKERRQ(ierr);
509 
510   /*
511     Second balancing step.
512   */
513   {
514     PetscTruth flg;
515     ierr = PetscOptionsHasName(PETSC_NULL,"-pc_turn_off_second_balancing",&flg);CHKERRQ(ierr);
516     if (!flg) {
517       ierr = PCNNBalancing(pc,r,vec1_B,z,vec2_B,vec3_B,(Vec)0,vec1_D,vec2_D,work_N);CHKERRQ(ierr);
518     } else {
519       PetscScalar zero = 0.0;
520       ierr = VecPointwiseMult(pcis->D,vec1_B,vec2_B);CHKERRQ(ierr);
521       ierr = VecSet(&zero,z);CHKERRQ(ierr);
522       ierr = VecScatterBegin(vec2_B,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
523       ierr = VecScatterEnd  (vec2_B,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
524     }
525   }
526 
527   PetscFunctionReturn(0);
528 }
529 
530 /* -------------------------------------------------------------------------- */
531 /*
532    PCNNBalancing - Computes z, as given in equations (15) and (16) (if the
533                    input argument u is provided), or s, as given in equations
534                    (12) and (13), if the input argument u is a null vector.
535                    Notice that the input argument u plays the role of u_i in
536                    equation (14). The equation numbers refer to [Man93].
537 
538    Input Parameters:
539 .  pcnn - NN preconditioner context.
540 .  r - MPI vector of all nodes (interior and interface). It's preserved.
541 .  u - (Optional) sequential vector of local interface nodes. It's preserved UNLESS vec3_B is null.
542 
543    Output Parameters:
544 .  z - MPI vector of interior and interface nodes. Returns s or z (see description above).
545 .  vec1_B - Sequential vector of local interface nodes. Workspace.
546 .  vec2_B - Sequential vector of local interface nodes. Workspace.
547 .  vec3_B - (Optional) sequential vector of local interface nodes. Workspace.
548 .  vec1_D - Sequential vector of local interior nodes. Workspace.
549 .  vec2_D - Sequential vector of local interior nodes. Workspace.
550 .  work_N - Array of all local nodes (interior and interface). Workspace.
551 
552 */
553 #undef __FUNCT__
554 #define __FUNCT__ "PCNNBalancing"
555 int PCNNBalancing (PC pc, Vec r, Vec u, Vec z, Vec vec1_B, Vec vec2_B, Vec vec3_B,
556                    Vec vec1_D, Vec vec2_D, PetscScalar *work_N)
557 {
558   int            k, ierr;
559   PetscScalar    zero     =  0.0;
560   PetscScalar    m_one    = -1.0;
561   PetscScalar    value;
562   PetscScalar*   lambda;
563   PC_NN*         pcnn     = (PC_NN*)(pc->data);
564   PC_IS*         pcis     = (PC_IS*)(pc->data);
565 
566   PetscFunctionBegin;
567   ierr = PetscLogEventBegin(PC_ApplyCoarse,0,0,0,0);CHKERRQ(ierr);
568 
569   if (u) {
570     if (!vec3_B) { vec3_B = u; }
571     ierr = VecPointwiseMult(pcis->D,u,vec1_B);CHKERRQ(ierr);
572     ierr = VecSet(&zero,z);CHKERRQ(ierr);
573     ierr = VecScatterBegin(vec1_B,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
574     ierr = VecScatterEnd  (vec1_B,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
575     ierr = VecScatterBegin(z,vec2_B,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_B);CHKERRQ(ierr);
576     ierr = VecScatterEnd  (z,vec2_B,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_B);CHKERRQ(ierr);
577     ierr = PCISApplySchur(pc,vec2_B,vec3_B,(Vec)0,vec1_D,vec2_D);CHKERRQ(ierr);
578     ierr = VecScale(&m_one,vec3_B);CHKERRQ(ierr);
579     ierr = VecCopy(r,z);CHKERRQ(ierr);
580     ierr = VecScatterBegin(vec3_B,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
581     ierr = VecScatterEnd  (vec3_B,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
582   } else {
583     ierr = VecCopy(r,z);CHKERRQ(ierr);
584   }
585   ierr = VecScatterBegin(z,vec2_B,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_B);CHKERRQ(ierr);
586   ierr = VecScatterEnd  (z,vec2_B,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_B);CHKERRQ(ierr);
587   ierr = PCISScatterArrayNToVecB(work_N,vec2_B,INSERT_VALUES,SCATTER_REVERSE,pc);CHKERRQ(ierr);
588   for (k=0, value=0.0; k<pcis->n_shared[0]; k++) { value += pcnn->DZ_IN[0][k] * work_N[pcis->shared[0][k]]; }
589   value *= pcnn->factor_coarse_rhs;  /* This factor is set in CreateCoarseMatrix(). */
590   {
591     int rank;
592     ierr = MPI_Comm_rank(pc->comm,&rank);CHKERRQ(ierr);
593     ierr = VecSetValue(pcnn->coarse_b,rank,value,INSERT_VALUES);CHKERRQ(ierr);
594     /*
595        Since we are only inserting local values (one value actually) we don't need to do the
596        reduction that tells us there is no data that needs to be moved. Hence we comment out these
597        ierr = VecAssemblyBegin(pcnn->coarse_b);CHKERRQ(ierr);
598        ierr = VecAssemblyEnd  (pcnn->coarse_b);CHKERRQ(ierr);
599     */
600   }
601   ierr = KSPSolve(pcnn->ksp_coarse,pcnn->coarse_b,pcnn->coarse_x);CHKERRQ(ierr);
602   if (!u) { ierr = VecScale(&m_one,pcnn->coarse_x);CHKERRQ(ierr); }
603   ierr = VecGetArray(pcnn->coarse_x,&lambda);CHKERRQ(ierr);
604   for (k=0; k<pcis->n_shared[0]; k++) { work_N[pcis->shared[0][k]] = *lambda * pcnn->DZ_IN[0][k]; }
605   ierr = VecRestoreArray(pcnn->coarse_x,&lambda);CHKERRQ(ierr);
606   ierr = PCISScatterArrayNToVecB(work_N,vec2_B,INSERT_VALUES,SCATTER_FORWARD,pc);CHKERRQ(ierr);
607   ierr = VecSet(&zero,z);CHKERRQ(ierr);
608   ierr = VecScatterBegin(vec2_B,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
609   ierr = VecScatterEnd  (vec2_B,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
610   if (!u) {
611     ierr = VecScatterBegin(z,vec2_B,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_B);CHKERRQ(ierr);
612     ierr = VecScatterEnd  (z,vec2_B,INSERT_VALUES,SCATTER_FORWARD,pcis->global_to_B);CHKERRQ(ierr);
613     ierr = PCISApplySchur(pc,vec2_B,vec1_B,(Vec)0,vec1_D,vec2_D);CHKERRQ(ierr);
614     ierr = VecCopy(r,z);CHKERRQ(ierr);
615   }
616   ierr = VecScatterBegin(vec1_B,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
617   ierr = VecScatterEnd  (vec1_B,z,ADD_VALUES,SCATTER_REVERSE,pcis->global_to_B);CHKERRQ(ierr);
618   ierr = PetscLogEventEnd(PC_ApplyCoarse,0,0,0,0);CHKERRQ(ierr);
619 
620   PetscFunctionReturn(0);
621 }
622 
623 #undef __FUNCT__
624 
625 
626 
627 /*  -------   E N D   O F   T H E   C O D E   -------  */
628 /*                                                     */
629 /*  From now on, "footnotes" (or "historical notes").  */
630 /*                                                     */
631 /*  -------------------------------------------------  */
632 
633 
634 
635 /* --------------------------------------------------------------------------
636    Historical note 01
637    -------------------------------------------------------------------------- */
638 /*
639    We considered the possibility of an alternative D_i that would still
640    provide a partition of unity (i.e., $ \sum_i  N_i D_i N_i^T = I $).
641    The basic principle was still the pseudo-inverse of the counting
642    function; the difference was that we would not count subdomains
643    that do not contribute to the coarse space (i.e., not pure-Neumann
644    subdomains).
645 
646    This turned out to be a bad idea:  we would solve trivial Neumann
647    problems in the not pure-Neumann subdomains, since we would be scaling
648    the balanced residual by zero.
649 */
650 
651 
652 
653 
654 /* --------------------------------------------------------------------------
655    Historical note 02
656    -------------------------------------------------------------------------- */
657 /*
658    We tried an alternative coarse problem, that would eliminate exactly a
659    constant error. Turned out not to improve the overall convergence.
660 */
661 
662 
663