xref: /petsc/src/ksp/pc/impls/is/nn/nn.c (revision e0f5bfbec699682fa3e8b8532b1176849ea4e12a)

#include <../src/ksp/pc/impls/is/nn/nn.h>

/* -------------------------------------------------------------------------- */
/*
   PCSetUp_NN - Prepares for the use of the NN preconditioner
                    by setting data structures and options.

   Input Parameter:
.  pc - the preconditioner context

   Application Interface Routine: PCSetUp()

   Note:
   The interface routine PCSetUp() is not usually called directly by
   the user, but instead is called by PCApply() if necessary.
*/
static PetscErrorCode PCSetUp_NN(PC pc) {
  PetscFunctionBegin;
  if (!pc->setupcalled) {
    /* Set up all the "iterative substructuring" common block */
    PetscCall(PCISSetUp(pc, PETSC_TRUE, PETSC_TRUE));
    /* Create the coarse matrix. */
    PetscCall(PCNNCreateCoarseMatrix(pc));
  }
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------- */
/*
   PCApply_NN - Applies the NN preconditioner to a vector.

   Input Parameters:
.  pc - the preconditioner context
.  r - input vector (global)

   Output Parameter:
.  z - output vector (global)

   Application Interface Routine: PCApply()
 */
static PetscErrorCode PCApply_NN(PC pc, Vec r, Vec z) {
  PC_IS      *pcis  = (PC_IS *)(pc->data);
  PetscScalar m_one = -1.0;
  Vec         w     = pcis->vec1_global;

  PetscFunctionBegin;
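  /*
     Overall structure (a sketch): with B_I denoting the (approximate) action of
     A_II^{-1} realized by ksp_D, the steps below compute

        z_B = M_S^{-1} ( r_B - A_BI B_I r_I ),
        z_I = B_I r_I - B_I A_IB z_B,

     i.e. a block factorization in which the interface preconditioner M_S^{-1}
     (applied by PCNNApplyInterfacePreconditioner()) acts on the statically
     condensed residual.
  */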
  /*
    Dirichlet solvers.
    Computing $ B_I^{(i)}r_I^{(i)} $ on each processor.
    Storing the local results in vec2_D.
  */
  PetscCall(VecScatterBegin(pcis->global_to_D, r, pcis->vec1_D, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(pcis->global_to_D, r, pcis->vec1_D, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(KSPSolve(pcis->ksp_D, pcis->vec1_D, pcis->vec2_D));

  /*
    Computing $ r_B - \sum_j \tilde R_j^T A_{BI}^{(j)} (B_I^{(j)}r_I^{(j)}) $ .
    Storing the result in the interface portion of the global vector w.
  */
  PetscCall(MatMult(pcis->A_BI, pcis->vec2_D, pcis->vec1_B));
  PetscCall(VecScale(pcis->vec1_B, m_one));
  PetscCall(VecCopy(r, w));
  PetscCall(VecScatterBegin(pcis->global_to_B, pcis->vec1_B, w, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(pcis->global_to_B, pcis->vec1_B, w, ADD_VALUES, SCATTER_REVERSE));

  /*
    Apply the interface preconditioner
  */
  PetscCall(PCNNApplyInterfacePreconditioner(pc, w, z, pcis->work_N, pcis->vec1_B, pcis->vec2_B, pcis->vec3_B, pcis->vec1_D, pcis->vec3_D, pcis->vec1_N, pcis->vec2_N));

  /*
    Computing $ t_I^{(i)} = A_{IB}^{(i)} \tilde R_i z_B $
    The result is stored in vec1_D.
  */
  PetscCall(VecScatterBegin(pcis->global_to_B, z, pcis->vec1_B, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(pcis->global_to_B, z, pcis->vec1_B, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(MatMult(pcis->A_IB, pcis->vec1_B, pcis->vec1_D));
  /*
    Dirichlet solvers.
    Computing $ B_I^{(i)}t_I^{(i)} $ and inserting into the global vector the blocks
    $ B_I^{(i)}r_I^{(i)} - B_I^{(i)}t_I^{(i)} $.
  */
  PetscCall(VecScatterBegin(pcis->global_to_D, pcis->vec2_D, z, INSERT_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(pcis->global_to_D, pcis->vec2_D, z, INSERT_VALUES, SCATTER_REVERSE));
  PetscCall(KSPSolve(pcis->ksp_D, pcis->vec1_D, pcis->vec2_D));
  PetscCall(VecScale(pcis->vec2_D, m_one));
  PetscCall(VecScatterBegin(pcis->global_to_D, pcis->vec2_D, z, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(pcis->global_to_D, pcis->vec2_D, z, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------- */
/*
   PCDestroy_NN - Destroys the private context for the NN preconditioner
   that was created with PCCreate_NN().

   Input Parameter:
.  pc - the preconditioner context

   Application Interface Routine: PCDestroy()
*/
static PetscErrorCode PCDestroy_NN(PC pc) {
  PC_NN *pcnn = (PC_NN *)pc->data;

  PetscFunctionBegin;
  PetscCall(PCISDestroy(pc));

  PetscCall(MatDestroy(&pcnn->coarse_mat));
  PetscCall(VecDestroy(&pcnn->coarse_x));
  PetscCall(VecDestroy(&pcnn->coarse_b));
  PetscCall(KSPDestroy(&pcnn->ksp_coarse));
  if (pcnn->DZ_IN) {
    PetscCall(PetscFree(pcnn->DZ_IN[0]));
    PetscCall(PetscFree(pcnn->DZ_IN));
  }

  /*
      Free the private data structure that was hanging off the PC
  */
  PetscCall(PetscFree(pc->data));
  PetscFunctionReturn(0);
}

/*MC
   PCNN - Balancing Neumann-Neumann for scalar elliptic PDEs.

   Options Database Keys:
+    -pc_nn_turn_off_first_balancing - do not balance the residual before solving the local Neumann problems
                                       (this skips the first coarse grid solve in the preconditioner)
.    -pc_nn_turn_off_second_balancing - do not balance the solution after solving the local Neumann problems
                                       (this skips the second coarse grid solve in the preconditioner)
.    -pc_is_damp_fixed <fact> -
.    -pc_is_remove_nullspace_fixed -
.    -pc_is_set_damping_factor_floating <fact> -
.    -pc_is_not_damp_floating -
-    -pc_is_not_remove_nullspace_floating -

   Options Database prefixes for the subsolvers this preconditioner uses:
+  -nn_coarse_pc_ - for the coarse grid preconditioner
.  -is_localD_pc_ - for the Dirichlet subproblem preconditioner
-  -is_localN_pc_ - for the Neumann subproblem preconditioner

   Level: intermediate

   Notes:
   The matrix used with this preconditioner must be of type `MATIS`.

   Unlike more 'conventional' Neumann-Neumann preconditioners, this one iterates over ALL the
   degrees of freedom, NOT just those on the interface (this allows the use of approximate solvers
   on the subdomains, though in our experience using approximate solvers is slower). A minimal
   usage sketch follows this manual page.

   Contributed by Paulo Goldfeld

.seealso: `PCCreate()`, `PCSetType()`, `PCType`, `PC`, `MATIS`, `PCBDDC`
M*/
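
/*
   A minimal usage sketch (an illustration, not code from this file): it assumes
   the caller has already created and assembled a MATIS matrix A and compatible
   vectors b and x.

     KSP ksp;
     PC  pc;
     PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp));
     PetscCall(KSPSetOperators(ksp, A, A)); /* A must have type MATIS */
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCNN));
     PetscCall(KSPSetFromOptions(ksp)); /* picks up the -pc_nn_* and subsolver options above */
     PetscCall(KSPSolve(ksp, b, x));
     PetscCall(KSPDestroy(&ksp));
*/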

PETSC_EXTERN PetscErrorCode PCCreate_NN(PC pc) {
  PC_NN *pcnn;

  PetscFunctionBegin;
  /*
     Create the private data structure for this preconditioner and
     attach it to the PC object.
  */
  PetscCall(PetscNew(&pcnn));
  pc->data = (void *)pcnn;

  PetscCall(PCISCreate(pc));
  pcnn->coarse_mat = NULL;
  pcnn->coarse_x   = NULL;
  pcnn->coarse_b   = NULL;
  pcnn->ksp_coarse = NULL;
  pcnn->DZ_IN      = NULL;

  /*
      Set the pointers for the functions that are provided above.
      Now when the user-level routines (such as PCApply(), PCDestroy(), etc.)
      are called, they will automatically call these functions.  Note we
      choose not to provide a couple of these functions since they are
      not needed.
  */
  pc->ops->apply               = PCApply_NN;
  pc->ops->applytranspose      = NULL;
  pc->ops->setup               = PCSetUp_NN;
  pc->ops->destroy             = PCDestroy_NN;
  pc->ops->view                = NULL;
  pc->ops->applyrichardson     = NULL;
  pc->ops->applysymmetricleft  = NULL;
  pc->ops->applysymmetricright = NULL;
  PetscFunctionReturn(0);
}

/*
   PCNNCreateCoarseMatrix - Builds the coarse matrix for the balancing steps, and
                            creates the coarse vectors and the coarse linear solver.
*/
PetscErrorCode PCNNCreateCoarseMatrix(PC pc) {
  MPI_Request  *send_request, *recv_request;
  PetscInt      i, j, k;
  PetscScalar  *mat;    /* Sub-matrix with this subdomain's contribution to the coarse matrix             */
  PetscScalar **DZ_OUT; /* proc[k].DZ_OUT[i][] = bit of vector to be sent from processor k to processor i */

  /* aliasing some names */
  PC_IS        *pcis     = (PC_IS *)(pc->data);
  PC_NN        *pcnn     = (PC_NN *)pc->data;
  PetscInt      n_neigh  = pcis->n_neigh;
  PetscInt     *neigh    = pcis->neigh;
  PetscInt     *n_shared = pcis->n_shared;
  PetscInt    **shared   = pcis->shared;
  PetscScalar **DZ_IN; /* Must be initialized after memory allocation. */

  PetscFunctionBegin;
  /* Allocate memory for mat (the +1 is to handle the case n_neigh equal to zero) */
  PetscCall(PetscMalloc1(n_neigh * n_neigh + 1, &mat));

  /* Allocate memory for DZ */
  /* Notice that DZ_OUT[0] is allocated some space that is never used. */
  /* This is just so that DZ_OUT and DZ_IN have exactly the same form. */
  {
    PetscInt size_of_Z = 0;
    PetscCall(PetscMalloc1(n_neigh + 1, &pcnn->DZ_IN));
    DZ_IN = pcnn->DZ_IN;
    PetscCall(PetscMalloc1(n_neigh + 1, &DZ_OUT));
    for (i = 0; i < n_neigh; i++) size_of_Z += n_shared[i];
    PetscCall(PetscMalloc1(size_of_Z + 1, &DZ_IN[0]));
    PetscCall(PetscMalloc1(size_of_Z + 1, &DZ_OUT[0]));
  }
  for (i = 1; i < n_neigh; i++) {
    DZ_IN[i]  = DZ_IN[i - 1] + n_shared[i - 1];
    DZ_OUT[i] = DZ_OUT[i - 1] + n_shared[i - 1];
  }
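
  /*
     Layout sketch: DZ_IN[0..n_neigh-1] are pointers into a single contiguous
     buffer, with DZ_IN[i] holding the n_shared[i] values on the interface
     shared with neigh[i] (DZ_OUT is laid out identically):

       DZ_IN[0]              DZ_IN[1]              DZ_IN[2]
       |<-- n_shared[0] -->| |<-- n_shared[1] -->| |<-- n_shared[2] -->| ...
  */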

  /* Set the values of DZ_OUT, in order to send this info to the neighbours */
  /* First, set the auxiliary array pcis->work_N. */
  PetscCall(PCISScatterArrayNToVecB(pcis->work_N, pcis->D, INSERT_VALUES, SCATTER_REVERSE, pc));
  for (i = 1; i < n_neigh; i++) {
    for (j = 0; j < n_shared[i]; j++) DZ_OUT[i][j] = pcis->work_N[shared[i][j]];
  }

  /* Non-blocking send/receive the common-interface chunks of scaled nullspaces */
  /* Notice that send_request[] and recv_request[] could have one less element. */
  /* We make them longer to have request[i] corresponding to neigh[i].          */
  {
    PetscMPIInt tag;
    PetscCall(PetscObjectGetNewTag((PetscObject)pc, &tag));
    PetscCall(PetscMalloc2(n_neigh + 1, &send_request, n_neigh + 1, &recv_request));
    for (i = 1; i < n_neigh; i++) {
      PetscCallMPI(MPI_Isend((void *)(DZ_OUT[i]), n_shared[i], MPIU_SCALAR, neigh[i], tag, PetscObjectComm((PetscObject)pc), &(send_request[i])));
      PetscCallMPI(MPI_Irecv((void *)(DZ_IN[i]), n_shared[i], MPIU_SCALAR, neigh[i], tag, PetscObjectComm((PetscObject)pc), &(recv_request[i])));
    }
  }

  /* Set DZ_IN[0][] (recall that neigh[0]==rank, always) */
  for (j = 0; j < n_shared[0]; j++) DZ_IN[0][j] = pcis->work_N[shared[0][j]];

  /* Start computing with local D*Z while communication goes on. */
  /* Apply the Schur complement. The result is stored in         */
  /* pcis->vec1_B and also scattered to pcis->work_N.            */
  PetscCall(PCNNApplySchurToChunk(pc, n_shared[0], shared[0], DZ_IN[0], pcis->work_N, pcis->vec1_B, pcis->vec2_B, pcis->vec1_D, pcis->vec2_D));

  /* Compute the first column, while completing the receiving. */
  for (i = 0; i < n_neigh; i++) {
    MPI_Status  stat;
    PetscMPIInt ind = 0;
    if (i > 0) {
      PetscCallMPI(MPI_Waitany(n_neigh - 1, recv_request + 1, &ind, &stat));
      ind++;
    }
    mat[ind * n_neigh + 0] = 0.0;
    for (k = 0; k < n_shared[ind]; k++) mat[ind * n_neigh + 0] += DZ_IN[ind][k] * pcis->work_N[shared[ind][k]];
  }

  /* Compute the remaining columns */
  for (j = 1; j < n_neigh; j++) {
    PetscCall(PCNNApplySchurToChunk(pc, n_shared[j], shared[j], DZ_IN[j], pcis->work_N, pcis->vec1_B, pcis->vec2_B, pcis->vec1_D, pcis->vec2_D));
    for (i = 0; i < n_neigh; i++) {
      mat[i * n_neigh + j] = 0.0;
      for (k = 0; k < n_shared[i]; k++) mat[i * n_neigh + j] += DZ_IN[i][k] * pcis->work_N[shared[i][k]];
    }
  }
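
  /*
     What was just computed (a sketch): mat[i * n_neigh + j] now holds this
     subdomain's contribution to the coarse-matrix entry coupling neigh[i] and
     neigh[j], i.e. (D z)_i^T S (D z)_j restricted to the local interface; the
     ADD_VALUES assembly below sums these contributions over all subdomains.
  */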

  /* Complete the sending. */
  if (n_neigh > 1) {
    MPI_Status *stat;
    PetscCall(PetscMalloc1(n_neigh - 1, &stat));
    PetscCallMPI(MPI_Waitall(n_neigh - 1, &(send_request[1]), stat));
    PetscCall(PetscFree(stat));
  }

  /* Free the memory for the MPI requests */
  PetscCall(PetscFree2(send_request, recv_request));

  /* Free the memory for DZ_OUT */
  if (DZ_OUT) {
    PetscCall(PetscFree(DZ_OUT[0]));
    PetscCall(PetscFree(DZ_OUT));
  }

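  /*
     The coarse problem has one unknown per subdomain (i.e., per MPI rank):
     the coarse vectors have local size 1 and global size 'size', and each
     coarse row couples a rank only to itself and its neighbours, hence the
     preallocation of 1 diagonal and n_neigh off-diagonal nonzeros below.
  */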
  {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)pc), &size));
    /* Create the global coarse vectors (rhs and solution). */
    PetscCall(VecCreateMPI(PetscObjectComm((PetscObject)pc), 1, size, &(pcnn->coarse_b)));
    PetscCall(VecDuplicate(pcnn->coarse_b, &(pcnn->coarse_x)));
    /* Create and set the global coarse AIJ matrix. */
    PetscCall(MatCreate(PetscObjectComm((PetscObject)pc), &(pcnn->coarse_mat)));
    PetscCall(MatSetSizes(pcnn->coarse_mat, 1, 1, size, size));
    PetscCall(MatSetType(pcnn->coarse_mat, MATAIJ));
    PetscCall(MatSeqAIJSetPreallocation(pcnn->coarse_mat, 1, NULL));
    PetscCall(MatMPIAIJSetPreallocation(pcnn->coarse_mat, 1, NULL, n_neigh, NULL));
    PetscCall(MatSetOption(pcnn->coarse_mat, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
    PetscCall(MatSetOption(pcnn->coarse_mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_FALSE));
    PetscCall(MatSetValues(pcnn->coarse_mat, n_neigh, neigh, n_neigh, neigh, mat, ADD_VALUES));
    PetscCall(MatAssemblyBegin(pcnn->coarse_mat, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(pcnn->coarse_mat, MAT_FINAL_ASSEMBLY));
  }

  {
    PetscMPIInt rank;
    PetscScalar one = 1.0;
    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)pc), &rank));
    /* "Zero out" rows of not-purely-Neumann subdomains */
    if (pcis->pure_neumann) { /* does NOT zero the row; pass an empty row list. The reason is that MatZeroRows() is collective. */
      PetscCall(MatZeroRows(pcnn->coarse_mat, 0, NULL, one, NULL, NULL));
    } else { /* here it DOES zero the row, since it's not a floating subdomain. */
      PetscInt row = (PetscInt)rank;
      PetscCall(MatZeroRows(pcnn->coarse_mat, 1, &row, one, NULL, NULL));
    }
  }

  /* Create the coarse linear solver context */
  {
    PC  pc_ctx, inner_pc;
    KSP inner_ksp;

    PetscCall(KSPCreate(PetscObjectComm((PetscObject)pc), &pcnn->ksp_coarse));
    PetscCall(PetscObjectIncrementTabLevel((PetscObject)pcnn->ksp_coarse, (PetscObject)pc, 2));
    PetscCall(KSPSetOperators(pcnn->ksp_coarse, pcnn->coarse_mat, pcnn->coarse_mat));
    PetscCall(KSPGetPC(pcnn->ksp_coarse, &pc_ctx));
    PetscCall(PCSetType(pc_ctx, PCREDUNDANT));
    PetscCall(KSPSetType(pcnn->ksp_coarse, KSPPREONLY));
    PetscCall(PCRedundantGetKSP(pc_ctx, &inner_ksp));
    PetscCall(KSPGetPC(inner_ksp, &inner_pc));
    PetscCall(PCSetType(inner_pc, PCLU));
    PetscCall(KSPSetOptionsPrefix(pcnn->ksp_coarse, "nn_coarse_"));
    PetscCall(KSPSetFromOptions(pcnn->ksp_coarse));
    /* set up the coarse solver here, so the cost is not incurred during the first PCApply() */
    PetscCall(KSPSetUp(pcnn->ksp_coarse));
  }

  /* Free the memory for mat */
  PetscCall(PetscFree(mat));

  /* for DEBUGGING, save the coarse matrix to a file. */
  {
    PetscBool flg = PETSC_FALSE;
    PetscCall(PetscOptionsGetBool(NULL, NULL, "-pc_nn_save_coarse_matrix", &flg, NULL));
    if (flg) {
      PetscViewer viewer;
      PetscCall(PetscViewerASCIIOpen(PETSC_COMM_WORLD, "coarse.m", &viewer));
      PetscCall(PetscViewerPushFormat(viewer, PETSC_VIEWER_ASCII_MATLAB));
      PetscCall(MatView(pcnn->coarse_mat, viewer));
      PetscCall(PetscViewerPopFormat(viewer));
      PetscCall(PetscViewerDestroy(&viewer));
    }
  }

  /*  Set the variable pcnn->factor_coarse_rhs. */
  pcnn->factor_coarse_rhs = (pcis->pure_neumann) ? 1.0 : 0.0;

  /* See historical note 02, at the bottom of this file. */
  PetscFunctionReturn(0);
}

/*
   PCNNApplySchurToChunk - Applies the local Schur complement to a chunk of interface values.

   Input Parameters:
.  pc - the preconditioner context
.  n - size of chunk
.  idx - indices of chunk
.  chunk - values

   Output Parameters:
.  array_N - result of Schur complement applied to chunk, scattered to big array
.  vec1_B  - result of Schur complement applied to chunk
.  vec2_B  - garbage (used as work space)
.  vec1_D  - garbage (used as work space)
.  vec2_D  - garbage (used as work space)

*/
PetscErrorCode PCNNApplySchurToChunk(PC pc, PetscInt n, PetscInt *idx, PetscScalar *chunk, PetscScalar *array_N, Vec vec1_B, Vec vec2_B, Vec vec1_D, Vec vec2_D) {
  PetscInt i;
  PC_IS   *pcis = (PC_IS *)(pc->data);

  PetscFunctionBegin;
  PetscCall(PetscArrayzero(array_N, pcis->n));
  for (i = 0; i < n; i++) array_N[idx[i]] = chunk[i];
  PetscCall(PCISScatterArrayNToVecB(array_N, vec2_B, INSERT_VALUES, SCATTER_FORWARD, pc));
  PetscCall(PCISApplySchur(pc, vec2_B, vec1_B, (Vec)0, vec1_D, vec2_D));
  PetscCall(PCISScatterArrayNToVecB(array_N, vec1_B, INSERT_VALUES, SCATTER_REVERSE, pc));
  PetscFunctionReturn(0);
}

/*
   PCNNApplyInterfacePreconditioner - Apply the interface preconditioner, i.e.,
                                      the preconditioner for the Schur complement.

   Input Parameters:
.  pc - the preconditioner context
.  r - global vector of interior and interface nodes. The values on the interior nodes are NOT used.

   Output Parameters:
.  z - global vector of interior and interface nodes. The values on the interface are the result of
       the application of the interface preconditioner to the interface part of r. The values on the
       interior nodes are garbage.
.  work_N - array of local nodes (interior and interface, including ghosts); returns garbage (used as work space)
.  vec1_B - vector of local interface nodes (including ghosts); returns garbage (used as work space)
.  vec2_B - vector of local interface nodes (including ghosts); returns garbage (used as work space)
.  vec3_B - vector of local interface nodes (including ghosts); returns garbage (used as work space)
.  vec1_D - vector of local interior nodes; returns garbage (used as work space)
.  vec2_D - vector of local interior nodes; returns garbage (used as work space)
.  vec1_N - vector of local nodes (interior and interface, including ghosts); returns garbage (used as work space)
.  vec2_N - vector of local nodes (interior and interface, including ghosts); returns garbage (used as work space)

*/
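/*
   In operator form, a sketch of what the steps below implement: the standard
   balanced Neumann-Neumann preconditioner for the Schur complement S (see
   [Man93] for the precise statement),

      M^{-1} = Q + (I - Q S) N (I - S Q),   with   N = \sum_i \tilde R_i^T D_i S_i^{+} D_i \tilde R_i,

   where Q is the coarse solve performed inside PCNNBalancing(), D_i is the
   partition-of-unity scaling pcis->D, and S_i^{+} is applied by
   PCISApplyInvSchur(). The two balancing steps correspond to the (I - S Q)
   and Q + (I - Q S) factors, and either may be turned off with the options
   listed in the PCNN manual page.
*/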
PetscErrorCode PCNNApplyInterfacePreconditioner(PC pc, Vec r, Vec z, PetscScalar *work_N, Vec vec1_B, Vec vec2_B, Vec vec3_B, Vec vec1_D, Vec vec2_D, Vec vec1_N, Vec vec2_N) {
  PC_IS *pcis = (PC_IS *)(pc->data);

  PetscFunctionBegin;
  /*
    First balancing step.
  */
  {
    PetscBool flg = PETSC_FALSE;
    PetscCall(PetscOptionsGetBool(NULL, NULL, "-pc_nn_turn_off_first_balancing", &flg, NULL));
    if (!flg) {
      PetscCall(PCNNBalancing(pc, r, (Vec)0, z, vec1_B, vec2_B, (Vec)0, vec1_D, vec2_D, work_N));
    } else {
      PetscCall(VecCopy(r, z));
    }
  }

  /*
    Extract the local interface part of z and scale it by D
  */
  PetscCall(VecScatterBegin(pcis->global_to_B, z, vec1_B, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(pcis->global_to_B, z, vec1_B, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecPointwiseMult(vec2_B, pcis->D, vec1_B));

  /* Neumann Solver */
  PetscCall(PCISApplyInvSchur(pc, vec2_B, vec1_B, vec1_N, vec2_N));

  /*
    Second balancing step.
  */
  {
    PetscBool flg = PETSC_FALSE;
    PetscCall(PetscOptionsGetBool(NULL, NULL, "-pc_nn_turn_off_second_balancing", &flg, NULL));
    if (!flg) {
      PetscCall(PCNNBalancing(pc, r, vec1_B, z, vec2_B, vec3_B, (Vec)0, vec1_D, vec2_D, work_N));
    } else {
      PetscCall(VecPointwiseMult(vec2_B, pcis->D, vec1_B));
      PetscCall(VecSet(z, 0.0));
      PetscCall(VecScatterBegin(pcis->global_to_B, vec2_B, z, ADD_VALUES, SCATTER_REVERSE));
      PetscCall(VecScatterEnd(pcis->global_to_B, vec2_B, z, ADD_VALUES, SCATTER_REVERSE));
    }
  }
  PetscFunctionReturn(0);
}

/*
   PCNNBalancing - Computes z, as given in equations (15) and (16) (if the
                   input argument u is provided), or s, as given in equations
                   (12) and (13), if the input argument u is a null vector.
                   Notice that the input argument u plays the role of u_i in
                   equation (14). The equation numbers refer to [Man93].

   Input Parameters:
.  pc - the preconditioner context.
.  r - MPI vector of all nodes (interior and interface). It's preserved.
.  u - (Optional) sequential vector of local interface nodes. It's preserved UNLESS vec3_B is null.

   Output Parameters:
.  z - MPI vector of interior and interface nodes. Returns s or z (see description above).
.  vec1_B - Sequential vector of local interface nodes. Workspace.
.  vec2_B - Sequential vector of local interface nodes. Workspace.
.  vec3_B - (Optional) sequential vector of local interface nodes. Workspace.
.  vec1_D - Sequential vector of local interior nodes. Workspace.
.  vec2_D - Sequential vector of local interior nodes. Workspace.
.  work_N - Array of all local nodes (interior and interface). Workspace.

*/
PetscErrorCode PCNNBalancing(PC pc, Vec r, Vec u, Vec z, Vec vec1_B, Vec vec2_B, Vec vec3_B, Vec vec1_D, Vec vec2_D, PetscScalar *work_N) {
  PetscInt     k;
  PetscScalar  value;
  PetscScalar *lambda;
  PC_NN       *pcnn = (PC_NN *)(pc->data);
  PC_IS       *pcis = (PC_IS *)(pc->data);

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PC_ApplyCoarse, pc, 0, 0, 0));
  if (u) {
    if (!vec3_B) vec3_B = u;
    PetscCall(VecPointwiseMult(vec1_B, pcis->D, u));
    PetscCall(VecSet(z, 0.0));
    PetscCall(VecScatterBegin(pcis->global_to_B, vec1_B, z, ADD_VALUES, SCATTER_REVERSE));
    PetscCall(VecScatterEnd(pcis->global_to_B, vec1_B, z, ADD_VALUES, SCATTER_REVERSE));
    PetscCall(VecScatterBegin(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterEnd(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(PCISApplySchur(pc, vec2_B, vec3_B, (Vec)0, vec1_D, vec2_D));
    PetscCall(VecScale(vec3_B, -1.0));
    PetscCall(VecCopy(r, z));
    PetscCall(VecScatterBegin(pcis->global_to_B, vec3_B, z, ADD_VALUES, SCATTER_REVERSE));
    PetscCall(VecScatterEnd(pcis->global_to_B, vec3_B, z, ADD_VALUES, SCATTER_REVERSE));
  } else {
    PetscCall(VecCopy(r, z));
  }
  PetscCall(VecScatterBegin(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall(PCISScatterArrayNToVecB(work_N, vec2_B, INSERT_VALUES, SCATTER_REVERSE, pc));
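  /* Local entry of the coarse right-hand side: the inner product of this
     subdomain's scaled null-space chunk D*Z (stored in DZ_IN[0]) with the
     residual currently held in z, restricted to the local interface. */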
  for (k = 0, value = 0.0; k < pcis->n_shared[0]; k++) value += pcnn->DZ_IN[0][k] * work_N[pcis->shared[0][k]];
  value *= pcnn->factor_coarse_rhs; /* This factor is set in PCNNCreateCoarseMatrix(). */
  {
    PetscMPIInt rank;
    PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)pc), &rank));
    PetscCall(VecSetValue(pcnn->coarse_b, rank, value, INSERT_VALUES));
    /*
       Since each process inserts only local values (one value, actually), no data needs
       to be moved and we can skip the reduction that would detect this; hence the
       assembly calls are omitted:
         PetscCall(VecAssemblyBegin(pcnn->coarse_b));
         PetscCall(VecAssemblyEnd(pcnn->coarse_b));
    */
  }
  PetscCall(KSPSolve(pcnn->ksp_coarse, pcnn->coarse_b, pcnn->coarse_x));
  if (!u) PetscCall(VecScale(pcnn->coarse_x, -1.0));
  PetscCall(VecGetArray(pcnn->coarse_x, &lambda));
  for (k = 0; k < pcis->n_shared[0]; k++) work_N[pcis->shared[0][k]] = *lambda * pcnn->DZ_IN[0][k];
  PetscCall(VecRestoreArray(pcnn->coarse_x, &lambda));
  PetscCall(PCISScatterArrayNToVecB(work_N, vec2_B, INSERT_VALUES, SCATTER_FORWARD, pc));
  PetscCall(VecSet(z, 0.0));
  PetscCall(VecScatterBegin(pcis->global_to_B, vec2_B, z, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(pcis->global_to_B, vec2_B, z, ADD_VALUES, SCATTER_REVERSE));
  if (!u) {
    PetscCall(VecScatterBegin(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterEnd(pcis->global_to_B, z, vec2_B, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(PCISApplySchur(pc, vec2_B, vec1_B, (Vec)0, vec1_D, vec2_D));
    PetscCall(VecCopy(r, z));
  }
  PetscCall(VecScatterBegin(pcis->global_to_B, vec1_B, z, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(pcis->global_to_B, vec1_B, z, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(PetscLogEventEnd(PC_ApplyCoarse, pc, 0, 0, 0));
  PetscFunctionReturn(0);
}

/*                                                     */
/*  From now on, "footnotes" (or "historical notes").  */
/*                                                     */
/*
   Historical note 01

   We considered the possibility of an alternative D_i that would still
   provide a partition of unity (i.e., $ \sum_i  N_i D_i N_i^T = I $).
   The basic principle was still the pseudo-inverse of the counting
   function; the difference was that we would not count subdomains
   that do not contribute to the coarse space (i.e., not pure-Neumann
   subdomains).

   This turned out to be a bad idea: we would solve trivial Neumann
   problems in the non-pure-Neumann subdomains, since we would be scaling
   the balanced residual by zero.
*/

/*
   Historical note 02

   We tried an alternative coarse problem, one that would eliminate exactly a
   constant error. It turned out not to improve the overall convergence.
*/
581