xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 90df33566319c93f10359b4e87408aff0f85ca0c)
1 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
2 #include <petsc/private/vecimpl.h>
3 #include <petsc/private/sfimpl.h>
4 #include <petsc/private/isimpl.h>
5 #include <petscblaslapack.h>
6 #include <petscsf.h>
7 #include <petsc/private/hashmapi.h>
8 
9 PetscErrorCode MatDestroy_MPIAIJ(Mat mat)
10 {
11   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
12 
13   PetscFunctionBegin;
14   PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
15   PetscCall(MatStashDestroy_Private(&mat->stash));
16   PetscCall(VecDestroy(&aij->diag));
17   PetscCall(MatDestroy(&aij->A));
18   PetscCall(MatDestroy(&aij->B));
19 #if defined(PETSC_USE_CTABLE)
20   PetscCall(PetscHMapIDestroy(&aij->colmap));
21 #else
22   PetscCall(PetscFree(aij->colmap));
23 #endif
24   PetscCall(PetscFree(aij->garray));
25   PetscCall(VecDestroy(&aij->lvec));
26   PetscCall(VecScatterDestroy(&aij->Mvctx));
27   PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
28   PetscCall(PetscFree(aij->ld));
29 
30   PetscCall(PetscFree(mat->data));
31 
32   /* may be created by MatCreateMPIAIJSumSeqAIJSymbolic */
33   PetscCall(PetscObjectCompose((PetscObject)mat, "MatMergeSeqsToMPI", NULL));
34 
35   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));
36   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatStoreValues_C", NULL));
37   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatRetrieveValues_C", NULL));
38   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatIsTranspose_C", NULL));
39   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocation_C", NULL));
40   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetPreallocation_C", NULL));
41   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocationCSR_C", NULL));
42   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDiagonalScaleLocal_C", NULL));
43   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpibaij_C", NULL));
44   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisbaij_C", NULL));
45 #if defined(PETSC_HAVE_CUDA)
46   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcusparse_C", NULL));
47 #endif
48 #if defined(PETSC_HAVE_HIP)
49   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijhipsparse_C", NULL));
50 #endif
51 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
52   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijkokkos_C", NULL));
53 #endif
54   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", NULL));
55 #if defined(PETSC_HAVE_ELEMENTAL)
56   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_elemental_C", NULL));
57 #endif
58 #if defined(PETSC_HAVE_SCALAPACK)
59   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_scalapack_C", NULL));
60 #endif
61 #if defined(PETSC_HAVE_HYPRE)
62   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_hypre_C", NULL));
63   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", NULL));
64 #endif
65   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
66   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_is_mpiaij_C", NULL));
67   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpiaij_C", NULL));
68   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetUseScalableIncreaseOverlap_C", NULL));
69   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijperm_C", NULL));
70   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijsell_C", NULL));
71 #if defined(PETSC_HAVE_MKL_SPARSE)
72   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijmkl_C", NULL));
73 #endif
74   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcrl_C", NULL));
75   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
76   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisell_C", NULL));
77   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetPreallocationCOO_C", NULL));
78   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetValuesCOO_C", NULL));
79   PetscFunctionReturn(PETSC_SUCCESS);
80 }
81 
82 /* defines MatSetValues_MPI_Hash(), MatAssemblyBegin_MPI_Hash(), and MatAssemblyEnd_MPI_Hash() */
83 #define TYPE AIJ
84 #define TYPE_AIJ
85 #include "../src/mat/impls/aij/mpi/mpihashmat.h"
86 #undef TYPE
87 #undef TYPE_AIJ
88 
89 static PetscErrorCode MatGetRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
90 {
91   Mat B;
92 
93   PetscFunctionBegin;
94   PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &B));
95   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject)B));
96   PetscCall(MatGetRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
97   PetscCall(MatDestroy(&B));
98   PetscFunctionReturn(PETSC_SUCCESS);
99 }
100 
101 static PetscErrorCode MatRestoreRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
102 {
103   Mat B;
104 
105   PetscFunctionBegin;
106   PetscCall(PetscObjectQuery((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject *)&B));
107   PetscCall(MatRestoreRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
108   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", NULL));
109   PetscFunctionReturn(PETSC_SUCCESS);
110 }
111 
112 /*MC
113    MATAIJ - MATAIJ = "aij" - A matrix type to be used for sparse matrices.
114 
115    This matrix type is identical to `MATSEQAIJ` when constructed with a single process communicator,
116    and `MATMPIAIJ` otherwise.  As a result, for single process communicators,
117   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
118   for communicators controlling multiple processes.  It is recommended that you call both of
119   the above preallocation routines for simplicity.
120 
121    Options Database Key:
122 . -mat_type aij - sets the matrix type to `MATAIJ` during a call to `MatSetFromOptions()`
123 
124   Level: beginner
125 
126   Developer Note:
127   Subclasses include `MATAIJCUSPARSE`, `MATAIJPERM`, `MATAIJSELL`, `MATAIJMKL`, `MATAIJCRL`, and `MATAIJKOKKOS`; this type also
128   automatically switches over to use inodes when enough exist.
129 
130 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateAIJ()`, `MatCreateSeqAIJ()`
131 M*/
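/*
   A minimal usage sketch (illustrative only, not part of the manual page above): create a `MATAIJ` matrix and
   call both preallocation routines as recommended; whichever routine does not match the matrix's actual type is
   ignored. The global size 100 and the per-row nonzero estimates are assumptions.

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
     PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100));
     PetscCall(MatSetType(A, MATAIJ));
     PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL));           // takes effect on a single-process communicator
     PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL));  // takes effect on a multi-process communicator
*/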
132 
133 /*MC
134    MATAIJCRL - MATAIJCRL = "aijcrl" - A matrix type to be used for sparse matrices.
135 
136    This matrix type is identical to `MATSEQAIJCRL` when constructed with a single process communicator,
137    and `MATMPIAIJCRL` otherwise.  As a result, for single process communicators,
138    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
139   for communicators controlling multiple processes.  It is recommended that you call both of
140   the above preallocation routines for simplicity.
141 
142    Options Database Key:
143 . -mat_type aijcrl - sets the matrix type to `MATMPIAIJCRL` during a call to `MatSetFromOptions()`
144 
145   Level: beginner
146 
147 .seealso: [](ch_matrices), `Mat`, `MatCreateMPIAIJCRL()`, `MATSEQAIJCRL`, `MATMPIAIJCRL`
148 M*/
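/*
   A short sketch (an assumption about typical usage, not taken from this file) of selecting the AIJCRL format
   through the options database, as described above:

     Mat A;
     PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
     PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100));
     PetscCall(MatSetFromOptions(A)); // picks up -mat_type aijcrl from the command line or an options file
*/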
149 
150 static PetscErrorCode MatBindToCPU_MPIAIJ(Mat A, PetscBool flg)
151 {
152   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
153 
154   PetscFunctionBegin;
155 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_VIENNACL)
156   A->boundtocpu = flg;
157 #endif
158   if (a->A) PetscCall(MatBindToCPU(a->A, flg));
159   if (a->B) PetscCall(MatBindToCPU(a->B, flg));
160 
161   /* In addition to binding the diagonal and off-diagonal matrices, bind the local vectors used for matrix-vector products.
162    * This may seem a little odd for a MatBindToCPU() call to do, but it makes no sense for the binding of these vectors
163    * to differ from the parent matrix. */
164   if (a->lvec) PetscCall(VecBindToCPU(a->lvec, flg));
165   if (a->diag) PetscCall(VecBindToCPU(a->diag, flg));
166   PetscFunctionReturn(PETSC_SUCCESS);
167 }
168 
169 static PetscErrorCode MatSetBlockSizes_MPIAIJ(Mat M, PetscInt rbs, PetscInt cbs)
170 {
171   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)M->data;
172 
173   PetscFunctionBegin;
174   if (mat->A) {
175     PetscCall(MatSetBlockSizes(mat->A, rbs, cbs));
176     PetscCall(MatSetBlockSizes(mat->B, rbs, 1));
177   }
178   PetscFunctionReturn(PETSC_SUCCESS);
179 }
180 
181 static PetscErrorCode MatFindNonzeroRows_MPIAIJ(Mat M, IS *keptrows)
182 {
183   Mat_MPIAIJ      *mat = (Mat_MPIAIJ *)M->data;
184   Mat_SeqAIJ      *a   = (Mat_SeqAIJ *)mat->A->data;
185   Mat_SeqAIJ      *b   = (Mat_SeqAIJ *)mat->B->data;
186   const PetscInt  *ia, *ib;
187   const MatScalar *aa, *bb, *aav, *bav;
188   PetscInt         na, nb, i, j, *rows, cnt = 0, n0rows;
189   PetscInt         m = M->rmap->n, rstart = M->rmap->rstart;
190 
191   PetscFunctionBegin;
192   *keptrows = NULL;
193 
194   ia = a->i;
195   ib = b->i;
196   PetscCall(MatSeqAIJGetArrayRead(mat->A, &aav));
197   PetscCall(MatSeqAIJGetArrayRead(mat->B, &bav));
198   for (i = 0; i < m; i++) {
199     na = ia[i + 1] - ia[i];
200     nb = ib[i + 1] - ib[i];
201     if (!na && !nb) {
202       cnt++;
203       goto ok1;
204     }
205     aa = aav + ia[i];
206     for (j = 0; j < na; j++) {
207       if (aa[j] != 0.0) goto ok1;
208     }
209     bb = PetscSafePointerPlusOffset(bav, ib[i]);
210     for (j = 0; j < nb; j++) {
211       if (bb[j] != 0.0) goto ok1;
212     }
213     cnt++;
214   ok1:;
215   }
216   PetscCallMPI(MPIU_Allreduce(&cnt, &n0rows, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)M)));
217   if (!n0rows) {
218     PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
219     PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
220     PetscFunctionReturn(PETSC_SUCCESS);
221   }
222   PetscCall(PetscMalloc1(M->rmap->n - cnt, &rows));
223   cnt = 0;
224   for (i = 0; i < m; i++) {
225     na = ia[i + 1] - ia[i];
226     nb = ib[i + 1] - ib[i];
227     if (!na && !nb) continue;
228     aa = aav + ia[i];
229     for (j = 0; j < na; j++) {
230       if (aa[j] != 0.0) {
231         rows[cnt++] = rstart + i;
232         goto ok2;
233       }
234     }
235     bb = PetscSafePointerPlusOffset(bav, ib[i]);
236     for (j = 0; j < nb; j++) {
237       if (bb[j] != 0.0) {
238         rows[cnt++] = rstart + i;
239         goto ok2;
240       }
241     }
242   ok2:;
243   }
244   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), cnt, rows, PETSC_OWN_POINTER, keptrows));
245   PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
246   PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
247   PetscFunctionReturn(PETSC_SUCCESS);
248 }
249 
250 static PetscErrorCode MatDiagonalSet_MPIAIJ(Mat Y, Vec D, InsertMode is)
251 {
252   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)Y->data;
253   PetscBool   cong;
254 
255   PetscFunctionBegin;
256   PetscCall(MatHasCongruentLayouts(Y, &cong));
257   if (Y->assembled && cong) {
258     PetscCall(MatDiagonalSet(aij->A, D, is));
259   } else {
260     PetscCall(MatDiagonalSet_Default(Y, D, is));
261   }
262   PetscFunctionReturn(PETSC_SUCCESS);
263 }
264 
265 static PetscErrorCode MatFindZeroDiagonals_MPIAIJ(Mat M, IS *zrows)
266 {
267   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)M->data;
268   PetscInt    i, rstart, nrows, *rows;
269 
270   PetscFunctionBegin;
271   *zrows = NULL;
272   PetscCall(MatFindZeroDiagonals_SeqAIJ_Private(aij->A, &nrows, &rows));
273   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
274   for (i = 0; i < nrows; i++) rows[i] += rstart;
275   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), nrows, rows, PETSC_OWN_POINTER, zrows));
276   PetscFunctionReturn(PETSC_SUCCESS);
277 }
278 
279 static PetscErrorCode MatGetColumnReductions_MPIAIJ(Mat A, PetscInt type, PetscReal *reductions)
280 {
281   Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)A->data;
282   PetscInt           i, m, n, *garray = aij->garray;
283   Mat_SeqAIJ        *a_aij = (Mat_SeqAIJ *)aij->A->data;
284   Mat_SeqAIJ        *b_aij = (Mat_SeqAIJ *)aij->B->data;
285   PetscReal         *work;
286   const PetscScalar *dummy;
287   PetscMPIInt        in;
288 
289   PetscFunctionBegin;
290   PetscCall(MatGetSize(A, &m, &n));
291   PetscCall(PetscCalloc1(n, &work));
292   PetscCall(MatSeqAIJGetArrayRead(aij->A, &dummy));
293   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &dummy));
294   PetscCall(MatSeqAIJGetArrayRead(aij->B, &dummy));
295   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &dummy));
296   if (type == NORM_2) {
297     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i] * a_aij->a[i]);
298     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i] * b_aij->a[i]);
299   } else if (type == NORM_1) {
300     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i]);
301     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i]);
302   } else if (type == NORM_INFINITY) {
303     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] = PetscMax(PetscAbsScalar(a_aij->a[i]), work[A->cmap->rstart + a_aij->j[i]]);
304     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] = PetscMax(PetscAbsScalar(b_aij->a[i]), work[garray[b_aij->j[i]]]);
305   } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
306     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscRealPart(a_aij->a[i]);
307     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscRealPart(b_aij->a[i]);
308   } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
309     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscImaginaryPart(a_aij->a[i]);
310     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscImaginaryPart(b_aij->a[i]);
311   } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
312   PetscCall(PetscMPIIntCast(n, &in));
313   if (type == NORM_INFINITY) {
314     PetscCallMPI(MPIU_Allreduce(work, reductions, in, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)A)));
315   } else {
316     PetscCallMPI(MPIU_Allreduce(work, reductions, in, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
317   }
318   PetscCall(PetscFree(work));
319   if (type == NORM_2) {
320     for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
321   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
322     for (i = 0; i < n; i++) reductions[i] /= m;
323   }
324   PetscFunctionReturn(PETSC_SUCCESS);
325 }
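/*
   The routine above accumulates per-column partial reductions into work[] indexed by global column (garray maps
   off-diagonal local columns back to global column numbers) and then combines them across ranks with MPI_SUM,
   or MPI_MAX for NORM_INFINITY. It is reached through public wrappers such as MatGetColumnNorms(); a usage
   sketch, where the length-N output array is the caller's responsibility (sizes are assumptions):

     PetscInt   N;
     PetscReal *norms;
     PetscCall(MatGetSize(A, NULL, &N));
     PetscCall(PetscMalloc1(N, &norms));
     PetscCall(MatGetColumnNorms(A, NORM_2, norms)); // every rank receives all N column norms
     PetscCall(PetscFree(norms));
*/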
326 
327 static PetscErrorCode MatFindOffBlockDiagonalEntries_MPIAIJ(Mat A, IS *is)
328 {
329   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
330   IS              sis, gis;
331   const PetscInt *isis, *igis;
332   PetscInt        n, *iis, nsis, ngis, rstart, i;
333 
334   PetscFunctionBegin;
335   PetscCall(MatFindOffBlockDiagonalEntries(a->A, &sis));
336   PetscCall(MatFindNonzeroRows(a->B, &gis));
337   PetscCall(ISGetSize(gis, &ngis));
338   PetscCall(ISGetSize(sis, &nsis));
339   PetscCall(ISGetIndices(sis, &isis));
340   PetscCall(ISGetIndices(gis, &igis));
341 
342   PetscCall(PetscMalloc1(ngis + nsis, &iis));
343   PetscCall(PetscArraycpy(iis, igis, ngis));
344   PetscCall(PetscArraycpy(iis + ngis, isis, nsis));
345   n = ngis + nsis;
346   PetscCall(PetscSortRemoveDupsInt(&n, iis));
347   PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
348   for (i = 0; i < n; i++) iis[i] += rstart;
349   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)A), n, iis, PETSC_OWN_POINTER, is));
350 
351   PetscCall(ISRestoreIndices(sis, &isis));
352   PetscCall(ISRestoreIndices(gis, &igis));
353   PetscCall(ISDestroy(&sis));
354   PetscCall(ISDestroy(&gis));
355   PetscFunctionReturn(PETSC_SUCCESS);
356 }
357 
358 /*
359   Local utility routine that creates a mapping from the global column
360 number to the local number in the off-diagonal part of the local
361 storage of the matrix.  When PETSC_USE_CTABLE is used this is scalable, at
362 a slightly higher hash table cost; without it, it is not scalable (each process
363 holds an order-N integer array) but is fast to access.
364 */
365 PetscErrorCode MatCreateColmap_MPIAIJ_Private(Mat mat)
366 {
367   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
368   PetscInt    n   = aij->B->cmap->n, i;
369 
370   PetscFunctionBegin;
371   PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
372 #if defined(PETSC_USE_CTABLE)
373   PetscCall(PetscHMapICreateWithSize(n, &aij->colmap));
374   for (i = 0; i < n; i++) PetscCall(PetscHMapISet(aij->colmap, aij->garray[i] + 1, i + 1));
375 #else
376   PetscCall(PetscCalloc1(mat->cmap->N + 1, &aij->colmap));
377   for (i = 0; i < n; i++) aij->colmap[aij->garray[i]] = i + 1;
378 #endif
379   PetscFunctionReturn(PETSC_SUCCESS);
380 }
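/*
   Sketch of how the colmap built above is consulted elsewhere in this file (see MatSetValues_MPIAIJ() and
   MatGetValues_MPIAIJ()): entries are stored shifted by one so that a lookup result of 0 means "this global
   column is not present in the off-diagonal part". Illustrative fragment only:

     PetscInt lcol;
   #if defined(PETSC_USE_CTABLE)
     PetscCall(PetscHMapIGetWithDefault(aij->colmap, gcol + 1, 0, &lcol));
     lcol--;                       // lcol < 0 now means "not found"
   #else
     lcol = aij->colmap[gcol] - 1; // colmap was PetscCalloc1'd, so a missing entry yields -1
   #endif
*/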
381 
382 #define MatSetValues_SeqAIJ_A_Private(row, col, value, addv, orow, ocol) \
383   do { \
384     if (col <= lastcol1) low1 = 0; \
385     else high1 = nrow1; \
386     lastcol1 = col; \
387     while (high1 - low1 > 5) { \
388       t = (low1 + high1) / 2; \
389       if (rp1[t] > col) high1 = t; \
390       else low1 = t; \
391     } \
392     for (_i = low1; _i < high1; _i++) { \
393       if (rp1[_i] > col) break; \
394       if (rp1[_i] == col) { \
395         if (addv == ADD_VALUES) { \
396           ap1[_i] += value; \
397           /* Not sure whether PetscLogFlops() will slow down the code */ \
398           (void)PetscLogFlops(1.0); \
399         } else ap1[_i] = value; \
400         goto a_noinsert; \
401       } \
402     } \
403     if (value == 0.0 && ignorezeroentries && row != col) { \
404       low1  = 0; \
405       high1 = nrow1; \
406       goto a_noinsert; \
407     } \
408     if (nonew == 1) { \
409       low1  = 0; \
410       high1 = nrow1; \
411       goto a_noinsert; \
412     } \
413     PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
414     MatSeqXAIJReallocateAIJ(A, am, 1, nrow1, row, col, rmax1, aa, ai, aj, rp1, ap1, aimax, nonew, MatScalar); \
415     N = nrow1++ - 1; \
416     a->nz++; \
417     high1++; \
418     /* shift up all the later entries in this row */ \
419     PetscCall(PetscArraymove(rp1 + _i + 1, rp1 + _i, N - _i + 1)); \
420     PetscCall(PetscArraymove(ap1 + _i + 1, ap1 + _i, N - _i + 1)); \
421     rp1[_i] = col; \
422     ap1[_i] = value; \
423   a_noinsert:; \
424     ailen[row] = nrow1; \
425   } while (0)
426 
427 #define MatSetValues_SeqAIJ_B_Private(row, col, value, addv, orow, ocol) \
428   do { \
429     if (col <= lastcol2) low2 = 0; \
430     else high2 = nrow2; \
431     lastcol2 = col; \
432     while (high2 - low2 > 5) { \
433       t = (low2 + high2) / 2; \
434       if (rp2[t] > col) high2 = t; \
435       else low2 = t; \
436     } \
437     for (_i = low2; _i < high2; _i++) { \
438       if (rp2[_i] > col) break; \
439       if (rp2[_i] == col) { \
440         if (addv == ADD_VALUES) { \
441           ap2[_i] += value; \
442           (void)PetscLogFlops(1.0); \
443         } else ap2[_i] = value; \
444         goto b_noinsert; \
445       } \
446     } \
447     if (value == 0.0 && ignorezeroentries) { \
448       low2  = 0; \
449       high2 = nrow2; \
450       goto b_noinsert; \
451     } \
452     if (nonew == 1) { \
453       low2  = 0; \
454       high2 = nrow2; \
455       goto b_noinsert; \
456     } \
457     PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
458     MatSeqXAIJReallocateAIJ(B, bm, 1, nrow2, row, col, rmax2, ba, bi, bj, rp2, ap2, bimax, nonew, MatScalar); \
459     N = nrow2++ - 1; \
460     b->nz++; \
461     high2++; \
462     /* shift up all the later entries in this row */ \
463     PetscCall(PetscArraymove(rp2 + _i + 1, rp2 + _i, N - _i + 1)); \
464     PetscCall(PetscArraymove(ap2 + _i + 1, ap2 + _i, N - _i + 1)); \
465     rp2[_i] = col; \
466     ap2[_i] = value; \
467   b_noinsert:; \
468     bilen[row] = nrow2; \
469   } while (0)
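/*
   The two macros above share one pattern: bisect the sorted column list of the target row until the search
   window is small, scan it linearly for the column, and if the column is absent shift the tail of the row up
   one slot to insert it. A standalone sketch of that pattern on plain arrays (a hypothetical helper, not part
   of PETSc; it assumes the row has spare capacity, whereas the macros call MatSeqXAIJReallocateAIJ()):

     static void InsertIntoSortedRow(PetscInt *cols, PetscScalar *vals, PetscInt *nrow, PetscInt col, PetscScalar v)
     {
       PetscInt low = 0, high = *nrow, i, k;
       while (high - low > 5) {       // narrow the window by bisection
         PetscInt t = (low + high) / 2;
         if (cols[t] > col) high = t;
         else low = t;
       }
       for (i = low; i < high; i++) { // linear scan inside the window
         if (cols[i] >= col) break;
       }
       if (i < *nrow && cols[i] == col) { vals[i] += v; return; } // existing entry: accumulate
       for (k = *nrow; k > i; k--) {  // open a slot by shifting the tail up
         cols[k] = cols[k - 1];
         vals[k] = vals[k - 1];
       }
       cols[i] = col;
       vals[i] = v;
       (*nrow)++;
     }
*/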
470 
471 static PetscErrorCode MatSetValuesRow_MPIAIJ(Mat A, PetscInt row, const PetscScalar v[])
472 {
473   Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)A->data;
474   Mat_SeqAIJ  *a = (Mat_SeqAIJ *)mat->A->data, *b = (Mat_SeqAIJ *)mat->B->data;
475   PetscInt     l, *garray                         = mat->garray, diag;
476   PetscScalar *aa, *ba;
477 
478   PetscFunctionBegin;
479   /* code only works for square matrices A */
480 
481   /* find size of row to the left of the diagonal part */
482   PetscCall(MatGetOwnershipRange(A, &diag, NULL));
483   row = row - diag;
484   for (l = 0; l < b->i[row + 1] - b->i[row]; l++) {
485     if (garray[b->j[b->i[row] + l]] > diag) break;
486   }
487   if (l) {
488     PetscCall(MatSeqAIJGetArray(mat->B, &ba));
489     PetscCall(PetscArraycpy(ba + b->i[row], v, l));
490     PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
491   }
492 
493   /* diagonal part */
494   if (a->i[row + 1] - a->i[row]) {
495     PetscCall(MatSeqAIJGetArray(mat->A, &aa));
496     PetscCall(PetscArraycpy(aa + a->i[row], v + l, a->i[row + 1] - a->i[row]));
497     PetscCall(MatSeqAIJRestoreArray(mat->A, &aa));
498   }
499 
500   /* right of diagonal part */
501   if (b->i[row + 1] - b->i[row] - l) {
502     PetscCall(MatSeqAIJGetArray(mat->B, &ba));
503     PetscCall(PetscArraycpy(ba + b->i[row] + l, v + l + a->i[row + 1] - a->i[row], b->i[row + 1] - b->i[row] - l));
504     PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
505   }
506   PetscFunctionReturn(PETSC_SUCCESS);
507 }
508 
509 PetscErrorCode MatSetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode addv)
510 {
511   Mat_MPIAIJ *aij   = (Mat_MPIAIJ *)mat->data;
512   PetscScalar value = 0.0;
513   PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
514   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
515   PetscBool   roworiented = aij->roworiented;
516 
517   /* Some Variables required in the macro */
518   Mat         A     = aij->A;
519   Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
520   PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
521   PetscBool   ignorezeroentries = a->ignorezeroentries;
522   Mat         B                 = aij->B;
523   Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
524   PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
525   MatScalar  *aa, *ba;
526   PetscInt   *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
527   PetscInt    nonew;
528   MatScalar  *ap1, *ap2;
529 
530   PetscFunctionBegin;
531   PetscCall(MatSeqAIJGetArray(A, &aa));
532   PetscCall(MatSeqAIJGetArray(B, &ba));
533   for (i = 0; i < m; i++) {
534     if (im[i] < 0) continue;
535     PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
536     if (im[i] >= rstart && im[i] < rend) {
537       row      = im[i] - rstart;
538       lastcol1 = -1;
539       rp1      = PetscSafePointerPlusOffset(aj, ai[row]);
540       ap1      = PetscSafePointerPlusOffset(aa, ai[row]);
541       rmax1    = aimax[row];
542       nrow1    = ailen[row];
543       low1     = 0;
544       high1    = nrow1;
545       lastcol2 = -1;
546       rp2      = PetscSafePointerPlusOffset(bj, bi[row]);
547       ap2      = PetscSafePointerPlusOffset(ba, bi[row]);
548       rmax2    = bimax[row];
549       nrow2    = bilen[row];
550       low2     = 0;
551       high2    = nrow2;
552 
553       for (j = 0; j < n; j++) {
554         if (v) value = roworiented ? v[i * n + j] : v[i + j * m];
555         if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
556         if (in[j] >= cstart && in[j] < cend) {
557           col   = in[j] - cstart;
558           nonew = a->nonew;
559           MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
560         } else if (in[j] < 0) {
561           continue;
562         } else {
563           PetscCheck(in[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
564           if (mat->was_assembled) {
565             if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
566 #if defined(PETSC_USE_CTABLE)
567             PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col)); /* map global col ids to local ones */
568             col--;
569 #else
570             col = aij->colmap[in[j]] - 1;
571 #endif
572             if (col < 0 && !((Mat_SeqAIJ *)aij->B->data)->nonew) { /* col < 0 means in[j] is a new col for B */
573               PetscCall(MatDisAssemble_MPIAIJ(mat));               /* Change aij->B from reduced/local format to expanded/global format */
574               col = in[j];
575               /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
576               B     = aij->B;
577               b     = (Mat_SeqAIJ *)B->data;
578               bimax = b->imax;
579               bi    = b->i;
580               bilen = b->ilen;
581               bj    = b->j;
582               ba    = b->a;
583               rp2   = PetscSafePointerPlusOffset(bj, bi[row]);
584               ap2   = PetscSafePointerPlusOffset(ba, bi[row]);
585               rmax2 = bimax[row];
586               nrow2 = bilen[row];
587               low2  = 0;
588               high2 = nrow2;
589               bm    = aij->B->rmap->n;
590               ba    = b->a;
591             } else if (col < 0 && !(ignorezeroentries && value == 0.0)) {
592               if (1 == ((Mat_SeqAIJ *)aij->B->data)->nonew) {
593                 PetscCall(PetscInfo(mat, "Skipping insertion of a new nonzero location in off-diagonal portion of matrix %g(%" PetscInt_FMT ",%" PetscInt_FMT ")\n", (double)PetscRealPart(value), im[i], in[j]));
594               } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", im[i], in[j]);
595             }
596           } else col = in[j];
597           nonew = b->nonew;
598           MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
599         }
600       }
601     } else {
602       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Setting off process row %" PetscInt_FMT " even though MatSetOption(,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) was set", im[i]);
603       if (!aij->donotstash) {
604         mat->assembled = PETSC_FALSE;
605         if (roworiented) {
606           PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i * n), (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
607         } else {
608           PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i), m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
609         }
610       }
611     }
612   }
613   PetscCall(MatSeqAIJRestoreArray(A, &aa)); /* aa, ba might have been freed due to reallocation above, but we don't access them here */
614   PetscCall(MatSeqAIJRestoreArray(B, &ba));
615   PetscFunctionReturn(PETSC_SUCCESS);
616 }
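/*
   Caller-side sketch (an assumption about typical usage, not taken from this file): values are passed with
   global row/column indices; rows owned by this rank go straight into aij->A or aij->B above, while rows owned
   by other ranks are stashed and only communicated during MatAssemblyBegin()/MatAssemblyEnd().

     PetscInt    row = 0, col = 0; // global indices; the row may be owned by another rank
     PetscScalar v   = 1.0;
     PetscCall(MatSetValues(A, 1, &row, 1, &col, &v, ADD_VALUES));
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
*/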
617 
618 /*
619     This function sets the j and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
620     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
621     No off-process parts of the matrix are allowed here, and mat->was_assembled has to be PETSC_FALSE.
622 */
623 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[])
624 {
625   Mat_MPIAIJ *aij    = (Mat_MPIAIJ *)mat->data;
626   Mat         A      = aij->A; /* diagonal part of the matrix */
627   Mat         B      = aij->B; /* off-diagonal part of the matrix */
628   Mat_SeqAIJ *a      = (Mat_SeqAIJ *)A->data;
629   Mat_SeqAIJ *b      = (Mat_SeqAIJ *)B->data;
630   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, col;
631   PetscInt   *ailen = a->ilen, *aj = a->j;
632   PetscInt   *bilen = b->ilen, *bj = b->j;
633   PetscInt    am          = aij->A->rmap->n, j;
634   PetscInt    diag_so_far = 0, dnz;
635   PetscInt    offd_so_far = 0, onz;
636 
637   PetscFunctionBegin;
638   /* Iterate over all rows of the matrix */
639   for (j = 0; j < am; j++) {
640     dnz = onz = 0;
641     /*  Iterate over all non-zero columns of the current row */
642     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
643       /* If column is in the diagonal */
644       if (mat_j[col] >= cstart && mat_j[col] < cend) {
645         aj[diag_so_far++] = mat_j[col] - cstart;
646         dnz++;
647       } else { /* off-diagonal entries */
648         bj[offd_so_far++] = mat_j[col];
649         onz++;
650       }
651     }
652     ailen[j] = dnz;
653     bilen[j] = onz;
654   }
655   PetscFunctionReturn(PETSC_SUCCESS);
656 }
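/*
   Worked example of the split performed above (numbers are illustrative assumptions): with cstart = 4 and
   cend = 8, a row with global columns {1, 4, 6, 9} is divided into

     diagonal block A:      local columns {0, 2}  (global columns 4 and 6, shifted by cstart)
     off-diagonal block B:  global columns {1, 9} (compacted to local indices later, in MatSetUpMultiply_MPIAIJ())

   so ailen[row] = 2 and bilen[row] = 2.
*/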
657 
658 /*
659     This function sets the local j, a and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
660     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
661     No off-process parts of the matrix are allowed here; they are set at a later point by MatSetValues_MPIAIJ().
662     Also, mat->was_assembled has to be false, otherwise the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart;
663     would not hold and the more complex MatSetValues_MPIAIJ() has to be used.
664 */
665 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[], const PetscScalar mat_a[])
666 {
667   Mat_MPIAIJ  *aij  = (Mat_MPIAIJ *)mat->data;
668   Mat          A    = aij->A; /* diagonal part of the matrix */
669   Mat          B    = aij->B; /* off-diagonal part of the matrix */
670   Mat_SeqAIJ  *aijd = (Mat_SeqAIJ *)aij->A->data, *aijo = (Mat_SeqAIJ *)aij->B->data;
671   Mat_SeqAIJ  *a      = (Mat_SeqAIJ *)A->data;
672   Mat_SeqAIJ  *b      = (Mat_SeqAIJ *)B->data;
673   PetscInt     cstart = mat->cmap->rstart, cend = mat->cmap->rend;
674   PetscInt    *ailen = a->ilen, *aj = a->j;
675   PetscInt    *bilen = b->ilen, *bj = b->j;
676   PetscInt     am          = aij->A->rmap->n, j;
677   PetscInt    *full_diag_i = aijd->i, *full_offd_i = aijo->i; /* These variables can also include non-local elements, which are set at a later point. */
678   PetscInt     col, dnz_row, onz_row, rowstart_diag, rowstart_offd;
679   PetscScalar *aa = a->a, *ba = b->a;
680 
681   PetscFunctionBegin;
682   /* Iterate over all rows of the matrix */
683   for (j = 0; j < am; j++) {
684     dnz_row = onz_row = 0;
685     rowstart_offd     = full_offd_i[j];
686     rowstart_diag     = full_diag_i[j];
687     /*  Iterate over all non-zero columns of the current row */
688     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
689       /* If column is in the diagonal */
690       if (mat_j[col] >= cstart && mat_j[col] < cend) {
691         aj[rowstart_diag + dnz_row] = mat_j[col] - cstart;
692         aa[rowstart_diag + dnz_row] = mat_a[col];
693         dnz_row++;
694       } else { /* off-diagonal entries */
695         bj[rowstart_offd + onz_row] = mat_j[col];
696         ba[rowstart_offd + onz_row] = mat_a[col];
697         onz_row++;
698       }
699     }
700     ailen[j] = dnz_row;
701     bilen[j] = onz_row;
702   }
703   PetscFunctionReturn(PETSC_SUCCESS);
704 }
705 
706 static PetscErrorCode MatGetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
707 {
708   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
709   PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
710   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
711 
712   PetscFunctionBegin;
713   for (i = 0; i < m; i++) {
714     if (idxm[i] < 0) continue; /* negative row */
715     PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, idxm[i], mat->rmap->N - 1);
716     PetscCheck(idxm[i] >= rstart && idxm[i] < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported, row requested %" PetscInt_FMT " range [%" PetscInt_FMT " %" PetscInt_FMT ")", idxm[i], rstart, rend);
717     row = idxm[i] - rstart;
718     for (j = 0; j < n; j++) {
719       if (idxn[j] < 0) continue; /* negative column */
720       PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, idxn[j], mat->cmap->N - 1);
721       if (idxn[j] >= cstart && idxn[j] < cend) {
722         col = idxn[j] - cstart;
723         PetscCall(MatGetValues(aij->A, 1, &row, 1, &col, v + i * n + j));
724       } else {
725         if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
726 #if defined(PETSC_USE_CTABLE)
727         PetscCall(PetscHMapIGetWithDefault(aij->colmap, idxn[j] + 1, 0, &col));
728         col--;
729 #else
730         col = aij->colmap[idxn[j]] - 1;
731 #endif
732         if ((col < 0) || (aij->garray[col] != idxn[j])) *(v + i * n + j) = 0.0;
733         else PetscCall(MatGetValues(aij->B, 1, &row, 1, &col, v + i * n + j));
734       }
735     }
736   }
737   PetscFunctionReturn(PETSC_SUCCESS);
738 }
739 
740 static PetscErrorCode MatAssemblyBegin_MPIAIJ(Mat mat, MatAssemblyType mode)
741 {
742   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
743   PetscInt    nstash, reallocs;
744 
745   PetscFunctionBegin;
746   if (aij->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);
747 
748   PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
749   PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
750   PetscCall(PetscInfo(aij->A, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
751   PetscFunctionReturn(PETSC_SUCCESS);
752 }
753 
754 PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat mat, MatAssemblyType mode)
755 {
756   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *)mat->data;
757   PetscMPIInt  n;
758   PetscInt     i, j, rstart, ncols, flg;
759   PetscInt    *row, *col;
760   PetscBool    other_disassembled;
761   PetscScalar *val;
762 
763   /* do not use 'b = (Mat_SeqAIJ*)aij->B->data' as B can be reset in disassembly */
764 
765   PetscFunctionBegin;
766   if (!aij->donotstash && !mat->nooffprocentries) {
767     while (1) {
768       PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
769       if (!flg) break;
770 
771       for (i = 0; i < n;) {
772         /* Now identify the consecutive vals belonging to the same row */
773         for (j = i, rstart = row[j]; j < n; j++) {
774           if (row[j] != rstart) break;
775         }
776         if (j < n) ncols = j - i;
777         else ncols = n - i;
778         /* Now assemble all these values with a single function call */
779         PetscCall(MatSetValues_MPIAIJ(mat, 1, row + i, ncols, col + i, val + i, mat->insertmode));
780         i = j;
781       }
782     }
783     PetscCall(MatStashScatterEnd_Private(&mat->stash));
784   }
785 #if defined(PETSC_HAVE_DEVICE)
786   if (mat->offloadmask == PETSC_OFFLOAD_CPU) aij->A->offloadmask = PETSC_OFFLOAD_CPU;
787   /* We call MatBindToCPU() on aij->A and aij->B here, because if MatBindToCPU_MPIAIJ() is called before assembly, it cannot bind these. */
788   if (mat->boundtocpu) {
789     PetscCall(MatBindToCPU(aij->A, PETSC_TRUE));
790     PetscCall(MatBindToCPU(aij->B, PETSC_TRUE));
791   }
792 #endif
793   PetscCall(MatAssemblyBegin(aij->A, mode));
794   PetscCall(MatAssemblyEnd(aij->A, mode));
795 
796   /* determine if any process has disassembled; if so we must
797      also disassemble ourselves, in order that we may reassemble. */
798   /*
799      if the nonzero structure of submatrix B cannot change then we know that
800      no process disassembled, thus we can skip this step
801   */
802   if (!((Mat_SeqAIJ *)aij->B->data)->nonew) {
803     PetscCallMPI(MPIU_Allreduce(&mat->was_assembled, &other_disassembled, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
804     if (mat->was_assembled && !other_disassembled) { /* mat on this rank has reduced off-diag B with local col ids, but globally it does not */
805       PetscCall(MatDisAssemble_MPIAIJ(mat));
806     }
807   }
808   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(MatSetUpMultiply_MPIAIJ(mat));
809   PetscCall(MatSetOption(aij->B, MAT_USE_INODES, PETSC_FALSE));
810 #if defined(PETSC_HAVE_DEVICE)
811   if (mat->offloadmask == PETSC_OFFLOAD_CPU && aij->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) aij->B->offloadmask = PETSC_OFFLOAD_CPU;
812 #endif
813   PetscCall(MatAssemblyBegin(aij->B, mode));
814   PetscCall(MatAssemblyEnd(aij->B, mode));
815 
816   PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
817 
818   aij->rowvalues = NULL;
819 
820   PetscCall(VecDestroy(&aij->diag));
821 
822   /* if no new nonzero locations are allowed in matrix then only set the matrix state the first time through */
823   if ((!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) || !((Mat_SeqAIJ *)aij->A->data)->nonew) {
824     PetscObjectState state = aij->A->nonzerostate + aij->B->nonzerostate;
825     PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
826   }
827 #if defined(PETSC_HAVE_DEVICE)
828   mat->offloadmask = PETSC_OFFLOAD_BOTH;
829 #endif
830   PetscFunctionReturn(PETSC_SUCCESS);
831 }
832 
833 static PetscErrorCode MatZeroEntries_MPIAIJ(Mat A)
834 {
835   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
836 
837   PetscFunctionBegin;
838   PetscCall(MatZeroEntries(l->A));
839   PetscCall(MatZeroEntries(l->B));
840   PetscFunctionReturn(PETSC_SUCCESS);
841 }
842 
843 static PetscErrorCode MatZeroRows_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
844 {
845   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)A->data;
846   PetscInt   *lrows;
847   PetscInt    r, len;
848   PetscBool   cong;
849 
850   PetscFunctionBegin;
851   /* get locally owned rows */
852   PetscCall(MatZeroRowsMapLocal_Private(A, N, rows, &len, &lrows));
853   PetscCall(MatHasCongruentLayouts(A, &cong));
854   /* fix right-hand side if needed */
855   if (x && b) {
856     const PetscScalar *xx;
857     PetscScalar       *bb;
858 
859     PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
860     PetscCall(VecGetArrayRead(x, &xx));
861     PetscCall(VecGetArray(b, &bb));
862     for (r = 0; r < len; ++r) bb[lrows[r]] = diag * xx[lrows[r]];
863     PetscCall(VecRestoreArrayRead(x, &xx));
864     PetscCall(VecRestoreArray(b, &bb));
865   }
866 
867   if (diag != 0.0 && cong) {
868     PetscCall(MatZeroRows(mat->A, len, lrows, diag, NULL, NULL));
869     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
870   } else if (diag != 0.0) { /* non-square or non-congruent layouts -> if keepnonzeropattern is false, we allow new insertions */
871     Mat_SeqAIJ *aijA = (Mat_SeqAIJ *)mat->A->data;
872     Mat_SeqAIJ *aijB = (Mat_SeqAIJ *)mat->B->data;
873     PetscInt    nnwA, nnwB;
874     PetscBool   nnzA, nnzB;
875 
876     nnwA = aijA->nonew;
877     nnwB = aijB->nonew;
878     nnzA = aijA->keepnonzeropattern;
879     nnzB = aijB->keepnonzeropattern;
880     if (!nnzA) {
881       PetscCall(PetscInfo(mat->A, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on diagonal block.\n"));
882       aijA->nonew = 0;
883     }
884     if (!nnzB) {
885       PetscCall(PetscInfo(mat->B, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on off-diagonal block.\n"));
886       aijB->nonew = 0;
887     }
888     /* Must zero here before the next loop */
889     PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
890     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
891     for (r = 0; r < len; ++r) {
892       const PetscInt row = lrows[r] + A->rmap->rstart;
893       if (row >= A->cmap->N) continue;
894       PetscCall(MatSetValues(A, 1, &row, 1, &row, &diag, INSERT_VALUES));
895     }
896     aijA->nonew = nnwA;
897     aijB->nonew = nnwB;
898   } else {
899     PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
900     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
901   }
902   PetscCall(PetscFree(lrows));
903   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
904   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
905 
906   /* only change matrix nonzero state if pattern was allowed to be changed */
907   if (!((Mat_SeqAIJ *)mat->A->data)->keepnonzeropattern || !((Mat_SeqAIJ *)mat->A->data)->nonew) {
908     PetscObjectState state = mat->A->nonzerostate + mat->B->nonzerostate;
909     PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
910   }
911   PetscFunctionReturn(PETSC_SUCCESS);
912 }
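/*
   Usage sketch for the routine above, reached through MatZeroRows() (values are assumptions for illustration);
   rows are given with global numbers, and passing a solution/right-hand-side pair instead of NULL, NULL fixes
   b = diag * x in the zeroed rows as done in the code above:

     PetscInt rows[] = {0, 7}; // global rows to zero
     PetscCall(MatZeroRows(A, 2, rows, 1.0, NULL, NULL)); // zero the rows and put 1.0 on their diagonals
*/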
913 
914 static PetscErrorCode MatZeroRowsColumns_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
915 {
916   Mat_MPIAIJ        *l = (Mat_MPIAIJ *)A->data;
917   PetscInt           n = A->rmap->n;
918   PetscInt           i, j, r, m, len = 0;
919   PetscInt          *lrows, *owners = A->rmap->range;
920   PetscMPIInt        p = 0;
921   PetscSFNode       *rrows;
922   PetscSF            sf;
923   const PetscScalar *xx;
924   PetscScalar       *bb, *mask, *aij_a;
925   Vec                xmask, lmask;
926   Mat_SeqAIJ        *aij = (Mat_SeqAIJ *)l->B->data;
927   const PetscInt    *aj, *ii, *ridx;
928   PetscScalar       *aa;
929 
930   PetscFunctionBegin;
931   /* Create SF where leaves are input rows and roots are owned rows */
932   PetscCall(PetscMalloc1(n, &lrows));
933   for (r = 0; r < n; ++r) lrows[r] = -1;
934   PetscCall(PetscMalloc1(N, &rrows));
935   for (r = 0; r < N; ++r) {
936     const PetscInt idx = rows[r];
937     PetscCheck(idx >= 0 && A->rmap->N > idx, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range [0,%" PetscInt_FMT ")", idx, A->rmap->N);
938     if (idx < owners[p] || owners[p + 1] <= idx) { /* short-circuit the search if the last p owns this row too */
939       PetscCall(PetscLayoutFindOwner(A->rmap, idx, &p));
940     }
941     rrows[r].rank  = p;
942     rrows[r].index = rows[r] - owners[p];
943   }
944   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
945   PetscCall(PetscSFSetGraph(sf, n, N, NULL, PETSC_OWN_POINTER, rrows, PETSC_OWN_POINTER));
946   /* Collect flags for rows to be zeroed */
947   PetscCall(PetscSFReduceBegin(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
948   PetscCall(PetscSFReduceEnd(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
949   PetscCall(PetscSFDestroy(&sf));
950   /* Compress and put in row numbers */
951   for (r = 0; r < n; ++r)
952     if (lrows[r] >= 0) lrows[len++] = r;
953   /* zero diagonal part of matrix */
954   PetscCall(MatZeroRowsColumns(l->A, len, lrows, diag, x, b));
955   /* handle off-diagonal part of matrix */
956   PetscCall(MatCreateVecs(A, &xmask, NULL));
957   PetscCall(VecDuplicate(l->lvec, &lmask));
958   PetscCall(VecGetArray(xmask, &bb));
959   for (i = 0; i < len; i++) bb[lrows[i]] = 1;
960   PetscCall(VecRestoreArray(xmask, &bb));
961   PetscCall(VecScatterBegin(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
962   PetscCall(VecScatterEnd(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
963   PetscCall(VecDestroy(&xmask));
964   if (x && b) { /* this code is buggy when the row and column layout don't match */
965     PetscBool cong;
966 
967     PetscCall(MatHasCongruentLayouts(A, &cong));
968     PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
969     PetscCall(VecScatterBegin(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
970     PetscCall(VecScatterEnd(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
971     PetscCall(VecGetArrayRead(l->lvec, &xx));
972     PetscCall(VecGetArray(b, &bb));
973   }
974   PetscCall(VecGetArray(lmask, &mask));
975   /* remove zeroed rows of off-diagonal matrix */
976   PetscCall(MatSeqAIJGetArray(l->B, &aij_a));
977   ii = aij->i;
978   for (i = 0; i < len; i++) PetscCall(PetscArrayzero(PetscSafePointerPlusOffset(aij_a, ii[lrows[i]]), ii[lrows[i] + 1] - ii[lrows[i]]));
979   /* loop over all elements of the off-process part of the matrix, zeroing removed columns */
980   if (aij->compressedrow.use) {
981     m    = aij->compressedrow.nrows;
982     ii   = aij->compressedrow.i;
983     ridx = aij->compressedrow.rindex;
984     for (i = 0; i < m; i++) {
985       n  = ii[i + 1] - ii[i];
986       aj = aij->j + ii[i];
987       aa = aij_a + ii[i];
988 
989       for (j = 0; j < n; j++) {
990         if (PetscAbsScalar(mask[*aj])) {
991           if (b) bb[*ridx] -= *aa * xx[*aj];
992           *aa = 0.0;
993         }
994         aa++;
995         aj++;
996       }
997       ridx++;
998     }
999   } else { /* do not use compressed row format */
1000     m = l->B->rmap->n;
1001     for (i = 0; i < m; i++) {
1002       n  = ii[i + 1] - ii[i];
1003       aj = aij->j + ii[i];
1004       aa = aij_a + ii[i];
1005       for (j = 0; j < n; j++) {
1006         if (PetscAbsScalar(mask[*aj])) {
1007           if (b) bb[i] -= *aa * xx[*aj];
1008           *aa = 0.0;
1009         }
1010         aa++;
1011         aj++;
1012       }
1013     }
1014   }
1015   if (x && b) {
1016     PetscCall(VecRestoreArray(b, &bb));
1017     PetscCall(VecRestoreArrayRead(l->lvec, &xx));
1018   }
1019   PetscCall(MatSeqAIJRestoreArray(l->B, &aij_a));
1020   PetscCall(VecRestoreArray(lmask, &mask));
1021   PetscCall(VecDestroy(&lmask));
1022   PetscCall(PetscFree(lrows));
1023 
1024   /* only change matrix nonzero state if pattern was allowed to be changed */
1025   if (!((Mat_SeqAIJ *)l->A->data)->nonew) {
1026     PetscObjectState state = l->A->nonzerostate + l->B->nonzerostate;
1027     PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
1028   }
1029   PetscFunctionReturn(PETSC_SUCCESS);
1030 }
1031 
1032 static PetscErrorCode MatMult_MPIAIJ(Mat A, Vec xx, Vec yy)
1033 {
1034   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1035   PetscInt    nt;
1036   VecScatter  Mvctx = a->Mvctx;
1037 
1038   PetscFunctionBegin;
1039   PetscCall(VecGetLocalSize(xx, &nt));
1040   PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
1041   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1042   PetscUseTypeMethod(a->A, mult, xx, yy);
1043   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1044   PetscUseTypeMethod(a->B, multadd, a->lvec, yy, yy);
1045   PetscFunctionReturn(PETSC_SUCCESS);
1046 }
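/*
   What the routine above computes, written out: with x_local the owned part of xx and x_ghost = lvec the
   scattered ghost values,

     yy = a->A * x_local + a->B * x_ghost

   The scatter that fills lvec is started before the local product with a->A so that communication overlaps
   with the local computation.
*/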
1047 
1048 static PetscErrorCode MatMultDiagonalBlock_MPIAIJ(Mat A, Vec bb, Vec xx)
1049 {
1050   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1051 
1052   PetscFunctionBegin;
1053   PetscCall(MatMultDiagonalBlock(a->A, bb, xx));
1054   PetscFunctionReturn(PETSC_SUCCESS);
1055 }
1056 
1057 static PetscErrorCode MatMultAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1058 {
1059   Mat_MPIAIJ *a     = (Mat_MPIAIJ *)A->data;
1060   VecScatter  Mvctx = a->Mvctx;
1061 
1062   PetscFunctionBegin;
1063   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1064   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
1065   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1066   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
1067   PetscFunctionReturn(PETSC_SUCCESS);
1068 }
1069 
1070 static PetscErrorCode MatMultTranspose_MPIAIJ(Mat A, Vec xx, Vec yy)
1071 {
1072   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1073 
1074   PetscFunctionBegin;
1075   /* do nondiagonal part */
1076   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1077   /* do local part */
1078   PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
1079   /* add partial results together */
1080   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1081   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1082   PetscFunctionReturn(PETSC_SUCCESS);
1083 }
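/*
   Transpose counterpart of the MatMult case above: lvec = a->B^T * x_local and yy = a->A^T * x_local are formed
   locally, then the reverse scatter accumulates the lvec entries into the owning locations of yy, so that
   yy = A^T x overall.
*/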
1084 
1085 static PetscErrorCode MatIsTranspose_MPIAIJ(Mat Amat, Mat Bmat, PetscReal tol, PetscBool *f)
1086 {
1087   MPI_Comm    comm;
1088   Mat_MPIAIJ *Aij = (Mat_MPIAIJ *)Amat->data, *Bij = (Mat_MPIAIJ *)Bmat->data;
1089   Mat         Adia = Aij->A, Bdia = Bij->A, Aoff, Boff, *Aoffs, *Boffs;
1090   IS          Me, Notme;
1091   PetscInt    M, N, first, last, *notme, i;
1092   PetscBool   lf;
1093   PetscMPIInt size;
1094 
1095   PetscFunctionBegin;
1096   /* Easy test: symmetric diagonal block */
1097   PetscCall(MatIsTranspose(Adia, Bdia, tol, &lf));
1098   PetscCallMPI(MPIU_Allreduce(&lf, f, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)Amat)));
1099   if (!*f) PetscFunctionReturn(PETSC_SUCCESS);
1100   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
1101   PetscCallMPI(MPI_Comm_size(comm, &size));
1102   if (size == 1) PetscFunctionReturn(PETSC_SUCCESS);
1103 
1104   /* Hard test: off-diagonal block. This requires a MatCreateSubMatrices() call. */
1105   PetscCall(MatGetSize(Amat, &M, &N));
1106   PetscCall(MatGetOwnershipRange(Amat, &first, &last));
1107   PetscCall(PetscMalloc1(N - last + first, &notme));
1108   for (i = 0; i < first; i++) notme[i] = i;
1109   for (i = last; i < M; i++) notme[i - last + first] = i;
1110   PetscCall(ISCreateGeneral(MPI_COMM_SELF, N - last + first, notme, PETSC_COPY_VALUES, &Notme));
1111   PetscCall(ISCreateStride(MPI_COMM_SELF, last - first, first, 1, &Me));
1112   PetscCall(MatCreateSubMatrices(Amat, 1, &Me, &Notme, MAT_INITIAL_MATRIX, &Aoffs));
1113   Aoff = Aoffs[0];
1114   PetscCall(MatCreateSubMatrices(Bmat, 1, &Notme, &Me, MAT_INITIAL_MATRIX, &Boffs));
1115   Boff = Boffs[0];
1116   PetscCall(MatIsTranspose(Aoff, Boff, tol, f));
1117   PetscCall(MatDestroyMatrices(1, &Aoffs));
1118   PetscCall(MatDestroyMatrices(1, &Boffs));
1119   PetscCall(ISDestroy(&Me));
1120   PetscCall(ISDestroy(&Notme));
1121   PetscCall(PetscFree(notme));
1122   PetscFunctionReturn(PETSC_SUCCESS);
1123 }
1124 
1125 static PetscErrorCode MatMultTransposeAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1126 {
1127   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1128 
1129   PetscFunctionBegin;
1130   /* do nondiagonal part */
1131   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1132   /* do local part */
1133   PetscCall((*a->A->ops->multtransposeadd)(a->A, xx, yy, zz));
1134   /* add partial results together */
1135   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1136   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1137   PetscFunctionReturn(PETSC_SUCCESS);
1138 }
1139 
1140 /*
1141   This only works correctly for square matrices where the subblock A->A is the
1142    diagonal block
1143 */
1144 static PetscErrorCode MatGetDiagonal_MPIAIJ(Mat A, Vec v)
1145 {
1146   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1147 
1148   PetscFunctionBegin;
1149   PetscCheck(A->rmap->N == A->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Supports only square matrix where A->A is diag block");
1150   PetscCheck(A->rmap->rstart == A->cmap->rstart && A->rmap->rend == A->cmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "row partition must equal col partition");
1151   PetscCall(MatGetDiagonal(a->A, v));
1152   PetscFunctionReturn(PETSC_SUCCESS);
1153 }
1154 
1155 static PetscErrorCode MatScale_MPIAIJ(Mat A, PetscScalar aa)
1156 {
1157   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1158 
1159   PetscFunctionBegin;
1160   PetscCall(MatScale(a->A, aa));
1161   PetscCall(MatScale(a->B, aa));
1162   PetscFunctionReturn(PETSC_SUCCESS);
1163 }
1164 
1165 static PetscErrorCode MatView_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
1166 {
1167   Mat_MPIAIJ        *aij    = (Mat_MPIAIJ *)mat->data;
1168   Mat_SeqAIJ        *A      = (Mat_SeqAIJ *)aij->A->data;
1169   Mat_SeqAIJ        *B      = (Mat_SeqAIJ *)aij->B->data;
1170   const PetscInt    *garray = aij->garray;
1171   const PetscScalar *aa, *ba;
1172   PetscInt           header[4], M, N, m, rs, cs, cnt, i, ja, jb;
1173   PetscInt64         nz, hnz;
1174   PetscInt          *rowlens;
1175   PetscInt          *colidxs;
1176   PetscScalar       *matvals;
1177   PetscMPIInt        rank;
1178 
1179   PetscFunctionBegin;
1180   PetscCall(PetscViewerSetUp(viewer));
1181 
1182   M  = mat->rmap->N;
1183   N  = mat->cmap->N;
1184   m  = mat->rmap->n;
1185   rs = mat->rmap->rstart;
1186   cs = mat->cmap->rstart;
1187   nz = A->nz + B->nz;
1188 
1189   /* write matrix header */
1190   header[0] = MAT_FILE_CLASSID;
1191   header[1] = M;
1192   header[2] = N;
1193   PetscCallMPI(MPI_Reduce(&nz, &hnz, 1, MPIU_INT64, MPI_SUM, 0, PetscObjectComm((PetscObject)mat)));
1194   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1195   if (rank == 0) PetscCall(PetscIntCast(hnz, &header[3]));
1196   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));
1197 
1198   /* fill in and store row lengths */
1199   PetscCall(PetscMalloc1(m, &rowlens));
1200   for (i = 0; i < m; i++) rowlens[i] = A->i[i + 1] - A->i[i] + B->i[i + 1] - B->i[i];
1201   PetscCall(PetscViewerBinaryWriteAll(viewer, rowlens, m, rs, M, PETSC_INT));
1202   PetscCall(PetscFree(rowlens));
1203 
1204   /* fill in and store column indices */
1205   PetscCall(PetscMalloc1(nz, &colidxs));
1206   for (cnt = 0, i = 0; i < m; i++) {
1207     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1208       if (garray[B->j[jb]] > cs) break;
1209       colidxs[cnt++] = garray[B->j[jb]];
1210     }
1211     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) colidxs[cnt++] = A->j[ja] + cs;
1212     for (; jb < B->i[i + 1]; jb++) colidxs[cnt++] = garray[B->j[jb]];
1213   }
1214   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1215   PetscCall(PetscViewerBinaryWriteAll(viewer, colidxs, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
1216   PetscCall(PetscFree(colidxs));
1217 
1218   /* fill in and store nonzero values */
1219   PetscCall(MatSeqAIJGetArrayRead(aij->A, &aa));
1220   PetscCall(MatSeqAIJGetArrayRead(aij->B, &ba));
1221   PetscCall(PetscMalloc1(nz, &matvals));
1222   for (cnt = 0, i = 0; i < m; i++) {
1223     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1224       if (garray[B->j[jb]] > cs) break;
1225       matvals[cnt++] = ba[jb];
1226     }
1227     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) matvals[cnt++] = aa[ja];
1228     for (; jb < B->i[i + 1]; jb++) matvals[cnt++] = ba[jb];
1229   }
1230   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &aa));
1231   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &ba));
1232   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1233   PetscCall(PetscViewerBinaryWriteAll(viewer, matvals, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
1234   PetscCall(PetscFree(matvals));
1235 
1236   /* write block size option to the viewer's .info file */
1237   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1238   PetscFunctionReturn(PETSC_SUCCESS);
1239 }
1240 
1241 #include <petscdraw.h>
1242 static PetscErrorCode MatView_MPIAIJ_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
1243 {
1244   Mat_MPIAIJ       *aij  = (Mat_MPIAIJ *)mat->data;
1245   PetscMPIInt       rank = aij->rank, size = aij->size;
1246   PetscBool         isdraw, iascii, isbinary;
1247   PetscViewer       sviewer;
1248   PetscViewerFormat format;
1249 
1250   PetscFunctionBegin;
1251   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1252   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1253   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1254   if (iascii) {
1255     PetscCall(PetscViewerGetFormat(viewer, &format));
1256     if (format == PETSC_VIEWER_LOAD_BALANCE) {
1257       PetscInt i, nmax = 0, nmin = PETSC_INT_MAX, navg = 0, *nz, nzlocal = ((Mat_SeqAIJ *)aij->A->data)->nz + ((Mat_SeqAIJ *)aij->B->data)->nz;
1258       PetscCall(PetscMalloc1(size, &nz));
1259       PetscCallMPI(MPI_Allgather(&nzlocal, 1, MPIU_INT, nz, 1, MPIU_INT, PetscObjectComm((PetscObject)mat)));
1260       for (i = 0; i < size; i++) {
1261         nmax = PetscMax(nmax, nz[i]);
1262         nmin = PetscMin(nmin, nz[i]);
1263         navg += nz[i];
1264       }
1265       PetscCall(PetscFree(nz));
1266       navg = navg / size;
1267       PetscCall(PetscViewerASCIIPrintf(viewer, "Load Balance - Nonzeros: Min %" PetscInt_FMT "  avg %" PetscInt_FMT "  max %" PetscInt_FMT "\n", nmin, navg, nmax));
1268       PetscFunctionReturn(PETSC_SUCCESS);
1269     }
1270     PetscCall(PetscViewerGetFormat(viewer, &format));
1271     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1272       MatInfo   info;
1273       PetscInt *inodes = NULL;
1274 
1275       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1276       PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
1277       PetscCall(MatInodeGetInodeSizes(aij->A, NULL, &inodes, NULL));
1278       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
1279       if (!inodes) {
1280         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, not using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
1281                                                      info.memory));
1282       } else {
1283         PetscCall(
1284           PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated, info.memory));
1285       }
1286       PetscCall(MatGetInfo(aij->A, MAT_LOCAL, &info));
1287       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] on-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1288       PetscCall(MatGetInfo(aij->B, MAT_LOCAL, &info));
1289       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] off-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1290       PetscCall(PetscViewerFlush(viewer));
1291       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
1292       PetscCall(PetscViewerASCIIPrintf(viewer, "Information on VecScatter used in matrix-vector product: \n"));
1293       PetscCall(VecScatterView(aij->Mvctx, viewer));
1294       PetscFunctionReturn(PETSC_SUCCESS);
1295     } else if (format == PETSC_VIEWER_ASCII_INFO) {
1296       PetscInt inodecount, inodelimit, *inodes;
1297       PetscCall(MatInodeGetInodeSizes(aij->A, &inodecount, &inodes, &inodelimit));
1298       if (inodes) {
1299         PetscCall(PetscViewerASCIIPrintf(viewer, "using I-node (on process 0) routines: found %" PetscInt_FMT " nodes, limit used is %" PetscInt_FMT "\n", inodecount, inodelimit));
1300       } else {
1301         PetscCall(PetscViewerASCIIPrintf(viewer, "not using I-node (on process 0) routines\n"));
1302       }
1303       PetscFunctionReturn(PETSC_SUCCESS);
1304     } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1305       PetscFunctionReturn(PETSC_SUCCESS);
1306     }
1307   } else if (isbinary) {
1308     if (size == 1) {
1309       PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1310       PetscCall(MatView(aij->A, viewer));
1311     } else {
1312       PetscCall(MatView_MPIAIJ_Binary(mat, viewer));
1313     }
1314     PetscFunctionReturn(PETSC_SUCCESS);
1315   } else if (iascii && size == 1) {
1316     PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1317     PetscCall(MatView(aij->A, viewer));
1318     PetscFunctionReturn(PETSC_SUCCESS);
1319   } else if (isdraw) {
1320     PetscDraw draw;
1321     PetscBool isnull;
1322     PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1323     PetscCall(PetscDrawIsNull(draw, &isnull));
1324     if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1325   }
1326 
1327   { /* assemble the entire matrix onto first processor */
1328     Mat A = NULL, Av;
1329     IS  isrow, iscol;
1330 
1331     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->rmap->N : 0, 0, 1, &isrow));
1332     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->cmap->N : 0, 0, 1, &iscol));
1333     PetscCall(MatCreateSubMatrix(mat, isrow, iscol, MAT_INITIAL_MATRIX, &A));
1334     PetscCall(MatMPIAIJGetSeqAIJ(A, &Av, NULL, NULL));
1335     /*  The commented code uses MatCreateSubMatrices instead */
1336     /*
1337     Mat *AA, A = NULL, Av;
1338     IS  isrow,iscol;
1339 
1340     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->rmap->N : 0,0,1,&isrow));
1341     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->cmap->N : 0,0,1,&iscol));
1342     PetscCall(MatCreateSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&AA));
1343     if (rank == 0) {
1344        PetscCall(PetscObjectReference((PetscObject)AA[0]));
1345        A    = AA[0];
1346        Av   = AA[0];
1347     }
1348     PetscCall(MatDestroySubMatrices(1,&AA));
1349 */
1350     PetscCall(ISDestroy(&iscol));
1351     PetscCall(ISDestroy(&isrow));
1352     /*
1353        Every process has to participate in the call that draws the matrix, since the graphics
1354        waits are synchronized across all processes that share the PetscDraw object
1355     */
1356     PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1357     if (rank == 0) {
1358       if (((PetscObject)mat)->name) PetscCall(PetscObjectSetName((PetscObject)Av, ((PetscObject)mat)->name));
1359       PetscCall(MatView_SeqAIJ(Av, sviewer));
1360     }
1361     PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1362     PetscCall(MatDestroy(&A));
1363   }
1364   PetscFunctionReturn(PETSC_SUCCESS);
1365 }
1366 
1367 PetscErrorCode MatView_MPIAIJ(Mat mat, PetscViewer viewer)
1368 {
1369   PetscBool iascii, isdraw, issocket, isbinary;
1370 
1371   PetscFunctionBegin;
1372   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1373   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1374   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1375   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
1376   if (iascii || isdraw || isbinary || issocket) PetscCall(MatView_MPIAIJ_ASCIIorDraworSocket(mat, viewer));
1377   PetscFunctionReturn(PETSC_SUCCESS);
1378 }
1379 
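/*
   Usage sketch (illustrative, not taken from the PETSc documentation): writing an assembled
   MATMPIAIJ matrix A with the binary viewer, which dispatches to MatView_MPIAIJ_Binary()
   above when run on more than one process, and reading it back with MatLoad(). The file
   name "A.dat" is an arbitrary example.

     PetscViewer viewer;
     Mat         B;

     PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, "A.dat", FILE_MODE_WRITE, &viewer));
     PetscCall(MatView(A, viewer));                 // header, row lengths, column indices, values
     PetscCall(PetscViewerDestroy(&viewer));

     PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, "A.dat", FILE_MODE_READ, &viewer));
     PetscCall(MatCreate(PETSC_COMM_WORLD, &B));
     PetscCall(MatSetType(B, MATMPIAIJ));
     PetscCall(MatLoad(B, viewer));
     PetscCall(PetscViewerDestroy(&viewer));
*/
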
1380 static PetscErrorCode MatSOR_MPIAIJ(Mat matin, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1381 {
1382   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)matin->data;
1383   Vec         bb1 = NULL;
1384   PetscBool   hasop;
1385 
1386   PetscFunctionBegin;
1387   if (flag == SOR_APPLY_UPPER) {
1388     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1389     PetscFunctionReturn(PETSC_SUCCESS);
1390   }
1391 
1392   if (its > 1 || ~flag & SOR_ZERO_INITIAL_GUESS || flag & SOR_EISENSTAT) PetscCall(VecDuplicate(bb, &bb1));
1393 
1394   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP) {
1395     if (flag & SOR_ZERO_INITIAL_GUESS) {
1396       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1397       its--;
1398     }
1399 
1400     while (its--) {
1401       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1402       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1403 
1404       /* update rhs: bb1 = bb - B*x */
1405       PetscCall(VecScale(mat->lvec, -1.0));
1406       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1407 
1408       /* local sweep */
1409       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_SYMMETRIC_SWEEP, fshift, lits, 1, xx));
1410     }
1411   } else if (flag & SOR_LOCAL_FORWARD_SWEEP) {
1412     if (flag & SOR_ZERO_INITIAL_GUESS) {
1413       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1414       its--;
1415     }
1416     while (its--) {
1417       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1418       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1419 
1420       /* update rhs: bb1 = bb - B*x */
1421       PetscCall(VecScale(mat->lvec, -1.0));
1422       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1423 
1424       /* local sweep */
1425       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_FORWARD_SWEEP, fshift, lits, 1, xx));
1426     }
1427   } else if (flag & SOR_LOCAL_BACKWARD_SWEEP) {
1428     if (flag & SOR_ZERO_INITIAL_GUESS) {
1429       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1430       its--;
1431     }
1432     while (its--) {
1433       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1434       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1435 
1436       /* update rhs: bb1 = bb - B*x */
1437       PetscCall(VecScale(mat->lvec, -1.0));
1438       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1439 
1440       /* local sweep */
1441       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_BACKWARD_SWEEP, fshift, lits, 1, xx));
1442     }
1443   } else if (flag & SOR_EISENSTAT) {
1444     Vec xx1;
1445 
1446     PetscCall(VecDuplicate(bb, &xx1));
1447     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_BACKWARD_SWEEP), fshift, lits, 1, xx));
1448 
1449     PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1450     PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1451     if (!mat->diag) {
1452       PetscCall(MatCreateVecs(matin, &mat->diag, NULL));
1453       PetscCall(MatGetDiagonal(matin, mat->diag));
1454     }
1455     PetscCall(MatHasOperation(matin, MATOP_MULT_DIAGONAL_BLOCK, &hasop));
1456     if (hasop) {
1457       PetscCall(MatMultDiagonalBlock(matin, xx, bb1));
1458     } else {
1459       PetscCall(VecPointwiseMult(bb1, mat->diag, xx));
1460     }
1461     PetscCall(VecAYPX(bb1, (omega - 2.0) / omega, bb));
1462 
1463     PetscCall(MatMultAdd(mat->B, mat->lvec, bb1, bb1));
1464 
1465     /* local sweep */
1466     PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_FORWARD_SWEEP), fshift, lits, 1, xx1));
1467     PetscCall(VecAXPY(xx, 1.0, xx1));
1468     PetscCall(VecDestroy(&xx1));
1469   } else SETERRQ(PetscObjectComm((PetscObject)matin), PETSC_ERR_SUP, "Parallel SOR not supported");
1470 
1471   PetscCall(VecDestroy(&bb1));
1472 
1473   matin->factorerrortype = mat->A->factorerrortype;
1474   PetscFunctionReturn(PETSC_SUCCESS);
1475 }
1476 
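/*
   Usage sketch (illustrative): in parallel only the SOR_LOCAL_* (processor-block) sweeps and
   the Eisenstat variant are supported by the routine above; a true parallel SOR raises
   PETSC_ERR_SUP. The usual way to reach it is through PCSOR, e.g. as a Richardson smoother
   (requires petscksp.h; A, b, x are assumed to exist).

     KSP ksp;
     PC  pc;

     PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp));
     PetscCall(KSPSetType(ksp, KSPRICHARDSON));
     PetscCall(KSPSetOperators(ksp, A, A));
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCSOR));
     PetscCall(PCSORSetSymmetric(pc, SOR_LOCAL_SYMMETRIC_SWEEP));   // SSOR on each local diagonal block
     PetscCall(KSPSolve(ksp, b, x));
     PetscCall(KSPDestroy(&ksp));
*/
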
1477 static PetscErrorCode MatPermute_MPIAIJ(Mat A, IS rowp, IS colp, Mat *B)
1478 {
1479   Mat             aA, aB, Aperm;
1480   const PetscInt *rwant, *cwant, *gcols, *ai, *bi, *aj, *bj;
1481   PetscScalar    *aa, *ba;
1482   PetscInt        i, j, m, n, ng, anz, bnz, *dnnz, *onnz, *tdnnz, *tonnz, *rdest, *cdest, *work, *gcdest;
1483   PetscSF         rowsf, sf;
1484   IS              parcolp = NULL;
1485   PetscBool       done;
1486 
1487   PetscFunctionBegin;
1488   PetscCall(MatGetLocalSize(A, &m, &n));
1489   PetscCall(ISGetIndices(rowp, &rwant));
1490   PetscCall(ISGetIndices(colp, &cwant));
1491   PetscCall(PetscMalloc3(PetscMax(m, n), &work, m, &rdest, n, &cdest));
1492 
1493   /* Invert row permutation to find out where my rows should go */
1494   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &rowsf));
1495   PetscCall(PetscSFSetGraphLayout(rowsf, A->rmap, A->rmap->n, NULL, PETSC_OWN_POINTER, rwant));
1496   PetscCall(PetscSFSetFromOptions(rowsf));
1497   for (i = 0; i < m; i++) work[i] = A->rmap->rstart + i;
1498   PetscCall(PetscSFReduceBegin(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1499   PetscCall(PetscSFReduceEnd(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1500 
1501   /* Invert column permutation to find out where my columns should go */
1502   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1503   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, A->cmap->n, NULL, PETSC_OWN_POINTER, cwant));
1504   PetscCall(PetscSFSetFromOptions(sf));
1505   for (i = 0; i < n; i++) work[i] = A->cmap->rstart + i;
1506   PetscCall(PetscSFReduceBegin(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1507   PetscCall(PetscSFReduceEnd(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1508   PetscCall(PetscSFDestroy(&sf));
1509 
1510   PetscCall(ISRestoreIndices(rowp, &rwant));
1511   PetscCall(ISRestoreIndices(colp, &cwant));
1512   PetscCall(MatMPIAIJGetSeqAIJ(A, &aA, &aB, &gcols));
1513 
1514   /* Find out where my gcols should go */
1515   PetscCall(MatGetSize(aB, NULL, &ng));
1516   PetscCall(PetscMalloc1(ng, &gcdest));
1517   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1518   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, ng, NULL, PETSC_OWN_POINTER, gcols));
1519   PetscCall(PetscSFSetFromOptions(sf));
1520   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1521   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1522   PetscCall(PetscSFDestroy(&sf));
1523 
1524   PetscCall(PetscCalloc4(m, &dnnz, m, &onnz, m, &tdnnz, m, &tonnz));
1525   PetscCall(MatGetRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1526   PetscCall(MatGetRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1527   for (i = 0; i < m; i++) {
1528     PetscInt    row = rdest[i];
1529     PetscMPIInt rowner;
1530     PetscCall(PetscLayoutFindOwner(A->rmap, row, &rowner));
1531     for (j = ai[i]; j < ai[i + 1]; j++) {
1532       PetscInt    col = cdest[aj[j]];
1533       PetscMPIInt cowner;
1534       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner)); /* Could build an index for the columns to eliminate this search */
1535       if (rowner == cowner) dnnz[i]++;
1536       else onnz[i]++;
1537     }
1538     for (j = bi[i]; j < bi[i + 1]; j++) {
1539       PetscInt    col = gcdest[bj[j]];
1540       PetscMPIInt cowner;
1541       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner));
1542       if (rowner == cowner) dnnz[i]++;
1543       else onnz[i]++;
1544     }
1545   }
1546   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1547   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1548   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1549   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1550   PetscCall(PetscSFDestroy(&rowsf));
1551 
1552   PetscCall(MatCreateAIJ(PetscObjectComm((PetscObject)A), A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N, 0, tdnnz, 0, tonnz, &Aperm));
1553   PetscCall(MatSeqAIJGetArray(aA, &aa));
1554   PetscCall(MatSeqAIJGetArray(aB, &ba));
1555   for (i = 0; i < m; i++) {
1556     PetscInt *acols = dnnz, *bcols = onnz; /* Repurpose now-unneeded arrays */
1557     PetscInt  j0, rowlen;
1558     rowlen = ai[i + 1] - ai[i];
1559     for (j0 = j = 0; j < rowlen; j0 = j) { /* rowlen could be larger than number of rows m, so sum in batches */
1560     for (j0 = j = 0; j < rowlen; j0 = j) { /* rowlen could exceed the number of local rows m (the scratch-array length), so insert in batches */
1561       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, acols, aa + ai[i] + j0, INSERT_VALUES));
1562     }
1563     rowlen = bi[i + 1] - bi[i];
1564     for (j0 = j = 0; j < rowlen; j0 = j) {
1565       for (; j < PetscMin(rowlen, j0 + m); j++) bcols[j - j0] = gcdest[bj[bi[i] + j]];
1566       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, bcols, ba + bi[i] + j0, INSERT_VALUES));
1567     }
1568   }
1569   PetscCall(MatAssemblyBegin(Aperm, MAT_FINAL_ASSEMBLY));
1570   PetscCall(MatAssemblyEnd(Aperm, MAT_FINAL_ASSEMBLY));
1571   PetscCall(MatRestoreRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1572   PetscCall(MatRestoreRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1573   PetscCall(MatSeqAIJRestoreArray(aA, &aa));
1574   PetscCall(MatSeqAIJRestoreArray(aB, &ba));
1575   PetscCall(PetscFree4(dnnz, onnz, tdnnz, tonnz));
1576   PetscCall(PetscFree3(work, rdest, cdest));
1577   PetscCall(PetscFree(gcdest));
1578   if (parcolp) PetscCall(ISDestroy(&colp));
1579   *B = Aperm;
1580   PetscFunctionReturn(PETSC_SUCCESS);
1581 }
1582 
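/*
   Usage sketch (illustrative): permuting a square MATMPIAIJ matrix with the routine above,
   here simply reversing the global row/column ordering. Assumes A is square with matching
   row and column layouts; each process supplies the permutation entries for the rows it owns.

     PetscInt rstart, rend, N, i, *ridx;
     IS       perm;
     Mat      Aperm;

     PetscCall(MatGetSize(A, &N, NULL));
     PetscCall(MatGetOwnershipRange(A, &rstart, &rend));
     PetscCall(PetscMalloc1(rend - rstart, &ridx));
     for (i = rstart; i < rend; i++) ridx[i - rstart] = N - 1 - i;   // a reversal is its own inverse
     PetscCall(ISCreateGeneral(PETSC_COMM_WORLD, rend - rstart, ridx, PETSC_OWN_POINTER, &perm));
     PetscCall(MatPermute(A, perm, perm, &Aperm));
     PetscCall(ISDestroy(&perm));
*/
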
1583 static PetscErrorCode MatGetGhosts_MPIAIJ(Mat mat, PetscInt *nghosts, const PetscInt *ghosts[])
1584 {
1585   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1586 
1587   PetscFunctionBegin;
1588   PetscCall(MatGetSize(aij->B, NULL, nghosts));
1589   if (ghosts) *ghosts = aij->garray;
1590   PetscFunctionReturn(PETSC_SUCCESS);
1591 }
1592 
1593 static PetscErrorCode MatGetInfo_MPIAIJ(Mat matin, MatInfoType flag, MatInfo *info)
1594 {
1595   Mat_MPIAIJ    *mat = (Mat_MPIAIJ *)matin->data;
1596   Mat            A = mat->A, B = mat->B;
1597   PetscLogDouble isend[5], irecv[5];
1598 
1599   PetscFunctionBegin;
1600   info->block_size = 1.0;
1601   PetscCall(MatGetInfo(A, MAT_LOCAL, info));
1602 
1603   isend[0] = info->nz_used;
1604   isend[1] = info->nz_allocated;
1605   isend[2] = info->nz_unneeded;
1606   isend[3] = info->memory;
1607   isend[4] = info->mallocs;
1608 
1609   PetscCall(MatGetInfo(B, MAT_LOCAL, info));
1610 
1611   isend[0] += info->nz_used;
1612   isend[1] += info->nz_allocated;
1613   isend[2] += info->nz_unneeded;
1614   isend[3] += info->memory;
1615   isend[4] += info->mallocs;
1616   if (flag == MAT_LOCAL) {
1617     info->nz_used      = isend[0];
1618     info->nz_allocated = isend[1];
1619     info->nz_unneeded  = isend[2];
1620     info->memory       = isend[3];
1621     info->mallocs      = isend[4];
1622   } else if (flag == MAT_GLOBAL_MAX) {
1623     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_MAX, PetscObjectComm((PetscObject)matin)));
1624 
1625     info->nz_used      = irecv[0];
1626     info->nz_allocated = irecv[1];
1627     info->nz_unneeded  = irecv[2];
1628     info->memory       = irecv[3];
1629     info->mallocs      = irecv[4];
1630   } else if (flag == MAT_GLOBAL_SUM) {
1631     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_SUM, PetscObjectComm((PetscObject)matin)));
1632 
1633     info->nz_used      = irecv[0];
1634     info->nz_allocated = irecv[1];
1635     info->nz_unneeded  = irecv[2];
1636     info->memory       = irecv[3];
1637     info->mallocs      = irecv[4];
1638   }
1639   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1640   info->fill_ratio_needed = 0;
1641   info->factor_mallocs    = 0;
1642   PetscFunctionReturn(PETSC_SUCCESS);
1643 }
1644 
1645 PetscErrorCode MatSetOption_MPIAIJ(Mat A, MatOption op, PetscBool flg)
1646 {
1647   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1648 
1649   PetscFunctionBegin;
1650   switch (op) {
1651   case MAT_NEW_NONZERO_LOCATIONS:
1652   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1653   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1654   case MAT_KEEP_NONZERO_PATTERN:
1655   case MAT_NEW_NONZERO_LOCATION_ERR:
1656   case MAT_USE_INODES:
1657   case MAT_IGNORE_ZERO_ENTRIES:
1658   case MAT_FORM_EXPLICIT_TRANSPOSE:
1659     MatCheckPreallocated(A, 1);
1660     PetscCall(MatSetOption(a->A, op, flg));
1661     PetscCall(MatSetOption(a->B, op, flg));
1662     break;
1663   case MAT_ROW_ORIENTED:
1664     MatCheckPreallocated(A, 1);
1665     a->roworiented = flg;
1666 
1667     PetscCall(MatSetOption(a->A, op, flg));
1668     PetscCall(MatSetOption(a->B, op, flg));
1669     break;
1670   case MAT_FORCE_DIAGONAL_ENTRIES:
1671   case MAT_SORTED_FULL:
1672     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
1673     break;
1674   case MAT_IGNORE_OFF_PROC_ENTRIES:
1675     a->donotstash = flg;
1676     break;
1677   /* Symmetry flags are handled directly by MatSetOption() and they don't affect preallocation */
1678   case MAT_SPD:
1679   case MAT_SYMMETRIC:
1680   case MAT_STRUCTURALLY_SYMMETRIC:
1681   case MAT_HERMITIAN:
1682   case MAT_SYMMETRY_ETERNAL:
1683   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
1684   case MAT_SPD_ETERNAL:
1685     /* if the local diagonal block is square it inherits some of the properties above */
1686     break;
1687   case MAT_SUBMAT_SINGLEIS:
1688     A->submat_singleis = flg;
1689     break;
1690   case MAT_STRUCTURE_ONLY:
1691     /* The option is handled directly by MatSetOption() */
1692     break;
1693   default:
1694     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
1695   }
1696   PetscFunctionReturn(PETSC_SUCCESS);
1697 }
1698 
1699 PetscErrorCode MatGetRow_MPIAIJ(Mat matin, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1700 {
1701   Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)matin->data;
1702   PetscScalar *vworkA, *vworkB, **pvA, **pvB, *v_p;
1703   PetscInt     i, *cworkA, *cworkB, **pcA, **pcB, cstart = matin->cmap->rstart;
1704   PetscInt     nztot, nzA, nzB, lrow, rstart = matin->rmap->rstart, rend = matin->rmap->rend;
1705   PetscInt    *cmap, *idx_p;
1706 
1707   PetscFunctionBegin;
1708   PetscCheck(!mat->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Already active");
1709   mat->getrowactive = PETSC_TRUE;
1710 
1711   if (!mat->rowvalues && (idx || v)) {
1712     /*
1713         allocate enough space to hold information from the longest row.
1714     */
1715     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *)mat->A->data, *Ba = (Mat_SeqAIJ *)mat->B->data;
1716     PetscInt    max = 1, tmp;
1717     for (i = 0; i < matin->rmap->n; i++) {
1718       tmp = Aa->i[i + 1] - Aa->i[i] + Ba->i[i + 1] - Ba->i[i];
1719       if (max < tmp) max = tmp;
1720     }
1721     PetscCall(PetscMalloc2(max, &mat->rowvalues, max, &mat->rowindices));
1722   }
1723 
1724   PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Only local rows");
1725   lrow = row - rstart;
1726 
1727   pvA = &vworkA;
1728   pcA = &cworkA;
1729   pvB = &vworkB;
1730   pcB = &cworkB;
1731   if (!v) {
1732     pvA = NULL;
1733     pvB = NULL;
1734   }
1735   if (!idx) {
1736     pcA = NULL;
1737     if (!v) pcB = NULL;
1738   }
1739   PetscCall((*mat->A->ops->getrow)(mat->A, lrow, &nzA, pcA, pvA));
1740   PetscCall((*mat->B->ops->getrow)(mat->B, lrow, &nzB, pcB, pvB));
1741   nztot = nzA + nzB;
1742 
1743   cmap = mat->garray;
1744   if (v || idx) {
1745     if (nztot) {
1746       /* Sort by increasing column numbers, assuming A and B already sorted */
1747       PetscInt imark = -1;
1748       if (v) {
1749         *v = v_p = mat->rowvalues;
1750         for (i = 0; i < nzB; i++) {
1751           if (cmap[cworkB[i]] < cstart) v_p[i] = vworkB[i];
1752           else break;
1753         }
1754         imark = i;
1755         for (i = 0; i < nzA; i++) v_p[imark + i] = vworkA[i];
1756         for (i = imark; i < nzB; i++) v_p[nzA + i] = vworkB[i];
1757       }
1758       if (idx) {
1759         *idx = idx_p = mat->rowindices;
1760         if (imark > -1) {
1761           for (i = 0; i < imark; i++) idx_p[i] = cmap[cworkB[i]];
1762         } else {
1763           for (i = 0; i < nzB; i++) {
1764             if (cmap[cworkB[i]] < cstart) idx_p[i] = cmap[cworkB[i]];
1765             else break;
1766           }
1767           imark = i;
1768         }
1769         for (i = 0; i < nzA; i++) idx_p[imark + i] = cstart + cworkA[i];
1770         for (i = imark; i < nzB; i++) idx_p[nzA + i] = cmap[cworkB[i]];
1771       }
1772     } else {
1773       if (idx) *idx = NULL;
1774       if (v) *v = NULL;
1775     }
1776   }
1777   *nz = nztot;
1778   PetscCall((*mat->A->ops->restorerow)(mat->A, lrow, &nzA, pcA, pvA));
1779   PetscCall((*mat->B->ops->restorerow)(mat->B, lrow, &nzB, pcB, pvB));
1780   PetscFunctionReturn(PETSC_SUCCESS);
1781 }
1782 
1783 PetscErrorCode MatRestoreRow_MPIAIJ(Mat mat, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1784 {
1785   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1786 
1787   PetscFunctionBegin;
1788   PetscCheck(aij->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "MatGetRow() must be called first");
1789   aij->getrowactive = PETSC_FALSE;
1790   PetscFunctionReturn(PETSC_SUCCESS);
1791 }
1792 
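/*
   Usage sketch (illustrative): the get/restore pair above is meant to be used one row at a
   time, on locally owned rows only, e.g.

     PetscInt           rstart, rend, row, ncols;
     const PetscInt    *cols;
     const PetscScalar *vals;

     PetscCall(MatGetOwnershipRange(A, &rstart, &rend));
     for (row = rstart; row < rend; row++) {
       PetscCall(MatGetRow(A, row, &ncols, &cols, &vals));
       // ... examine cols[] and vals[] for this row ...
       PetscCall(MatRestoreRow(A, row, &ncols, &cols, &vals));
     }
*/
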
1793 static PetscErrorCode MatNorm_MPIAIJ(Mat mat, NormType type, PetscReal *norm)
1794 {
1795   Mat_MPIAIJ      *aij  = (Mat_MPIAIJ *)mat->data;
1796   Mat_SeqAIJ      *amat = (Mat_SeqAIJ *)aij->A->data, *bmat = (Mat_SeqAIJ *)aij->B->data;
1797   PetscInt         i, j, cstart = mat->cmap->rstart;
1798   PetscReal        sum = 0.0;
1799   const MatScalar *v, *amata, *bmata;
1800   PetscMPIInt      iN;
1801 
1802   PetscFunctionBegin;
1803   if (aij->size == 1) {
1804     PetscCall(MatNorm(aij->A, type, norm));
1805   } else {
1806     PetscCall(MatSeqAIJGetArrayRead(aij->A, &amata));
1807     PetscCall(MatSeqAIJGetArrayRead(aij->B, &bmata));
1808     if (type == NORM_FROBENIUS) {
1809       v = amata;
1810       for (i = 0; i < amat->nz; i++) {
1811         sum += PetscRealPart(PetscConj(*v) * (*v));
1812         v++;
1813       }
1814       v = bmata;
1815       for (i = 0; i < bmat->nz; i++) {
1816         sum += PetscRealPart(PetscConj(*v) * (*v));
1817         v++;
1818       }
1819       PetscCallMPI(MPIU_Allreduce(&sum, norm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1820       *norm = PetscSqrtReal(*norm);
1821       PetscCall(PetscLogFlops(2.0 * amat->nz + 2.0 * bmat->nz));
1822     } else if (type == NORM_1) { /* max column norm */
1823       PetscReal *tmp, *tmp2;
1824       PetscInt  *jj, *garray = aij->garray;
1825       PetscCall(PetscCalloc1(mat->cmap->N + 1, &tmp));
1826       PetscCall(PetscMalloc1(mat->cmap->N + 1, &tmp2));
1827       *norm = 0.0;
1828       v     = amata;
1829       jj    = amat->j;
1830       for (j = 0; j < amat->nz; j++) {
1831         tmp[cstart + *jj++] += PetscAbsScalar(*v);
1832         v++;
1833       }
1834       v  = bmata;
1835       jj = bmat->j;
1836       for (j = 0; j < bmat->nz; j++) {
1837         tmp[garray[*jj++]] += PetscAbsScalar(*v);
1838         v++;
1839       }
1840       PetscCall(PetscMPIIntCast(mat->cmap->N, &iN));
1841       PetscCallMPI(MPIU_Allreduce(tmp, tmp2, iN, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1842       for (j = 0; j < mat->cmap->N; j++) {
1843         if (tmp2[j] > *norm) *norm = tmp2[j];
1844       }
1845       PetscCall(PetscFree(tmp));
1846       PetscCall(PetscFree(tmp2));
1847       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1848     } else if (type == NORM_INFINITY) { /* max row norm */
1849       PetscReal ntemp = 0.0;
1850       for (j = 0; j < aij->A->rmap->n; j++) {
1851         v   = PetscSafePointerPlusOffset(amata, amat->i[j]);
1852         sum = 0.0;
1853         for (i = 0; i < amat->i[j + 1] - amat->i[j]; i++) {
1854           sum += PetscAbsScalar(*v);
1855           v++;
1856         }
1857         v = PetscSafePointerPlusOffset(bmata, bmat->i[j]);
1858         for (i = 0; i < bmat->i[j + 1] - bmat->i[j]; i++) {
1859           sum += PetscAbsScalar(*v);
1860           v++;
1861         }
1862         if (sum > ntemp) ntemp = sum;
1863       }
1864       PetscCallMPI(MPIU_Allreduce(&ntemp, norm, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)mat)));
1865       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1866     } else SETERRQ(PetscObjectComm((PetscObject)mat), PETSC_ERR_SUP, "No support for two norm");
1867     PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &amata));
1868     PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &bmata));
1869   }
1870   PetscFunctionReturn(PETSC_SUCCESS);
1871 }
1872 
1873 static PetscErrorCode MatTranspose_MPIAIJ(Mat A, MatReuse reuse, Mat *matout)
1874 {
1875   Mat_MPIAIJ      *a    = (Mat_MPIAIJ *)A->data, *b;
1876   Mat_SeqAIJ      *Aloc = (Mat_SeqAIJ *)a->A->data, *Bloc = (Mat_SeqAIJ *)a->B->data, *sub_B_diag;
1877   PetscInt         M = A->rmap->N, N = A->cmap->N, ma, na, mb, nb, row, *cols, *cols_tmp, *B_diag_ilen, i, ncol, A_diag_ncol;
1878   const PetscInt  *ai, *aj, *bi, *bj, *B_diag_i;
1879   Mat              B, A_diag, *B_diag;
1880   const MatScalar *pbv, *bv;
1881 
1882   PetscFunctionBegin;
1883   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *matout));
1884   ma = A->rmap->n;
1885   na = A->cmap->n;
1886   mb = a->B->rmap->n;
1887   nb = a->B->cmap->n;
1888   ai = Aloc->i;
1889   aj = Aloc->j;
1890   bi = Bloc->i;
1891   bj = Bloc->j;
1892   if (reuse == MAT_INITIAL_MATRIX || *matout == A) {
1893     PetscInt            *d_nnz, *g_nnz, *o_nnz;
1894     PetscSFNode         *oloc;
1895     PETSC_UNUSED PetscSF sf;
1896 
1897     PetscCall(PetscMalloc4(na, &d_nnz, na, &o_nnz, nb, &g_nnz, nb, &oloc));
1898     /* compute d_nnz for preallocation */
1899     PetscCall(PetscArrayzero(d_nnz, na));
1900     for (i = 0; i < ai[ma]; i++) d_nnz[aj[i]]++;
1901     /* compute local off-diagonal contributions */
1902     PetscCall(PetscArrayzero(g_nnz, nb));
1903     for (i = 0; i < bi[ma]; i++) g_nnz[bj[i]]++;
1904     /* map those to global */
1905     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1906     PetscCall(PetscSFSetGraphLayout(sf, A->cmap, nb, NULL, PETSC_USE_POINTER, a->garray));
1907     PetscCall(PetscSFSetFromOptions(sf));
1908     PetscCall(PetscArrayzero(o_nnz, na));
1909     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1910     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1911     PetscCall(PetscSFDestroy(&sf));
1912 
1913     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
1914     PetscCall(MatSetSizes(B, A->cmap->n, A->rmap->n, N, M));
1915     PetscCall(MatSetBlockSizes(B, PetscAbs(A->cmap->bs), PetscAbs(A->rmap->bs)));
1916     PetscCall(MatSetType(B, ((PetscObject)A)->type_name));
1917     PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
1918     PetscCall(PetscFree4(d_nnz, o_nnz, g_nnz, oloc));
1919   } else {
1920     B = *matout;
1921     PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
1922   }
1923 
1924   b           = (Mat_MPIAIJ *)B->data;
1925   A_diag      = a->A;
1926   B_diag      = &b->A;
1927   sub_B_diag  = (Mat_SeqAIJ *)(*B_diag)->data;
1928   A_diag_ncol = A_diag->cmap->N;
1929   B_diag_ilen = sub_B_diag->ilen;
1930   B_diag_i    = sub_B_diag->i;
1931 
1932   /* Set ilen for diagonal of B */
1933   for (i = 0; i < A_diag_ncol; i++) B_diag_ilen[i] = B_diag_i[i + 1] - B_diag_i[i];
1934 
1935   /* Transpose the diagonal part of the matrix. In contrast to the off-diagonal part, this can be done
1936   very quickly (i.e., without using MatSetValues()), because all writes are local. */
1937   PetscCall(MatTransposeSetPrecursor(A_diag, *B_diag));
1938   PetscCall(MatTranspose(A_diag, MAT_REUSE_MATRIX, B_diag));
1939 
1940   /* copy over the B part */
1941   PetscCall(PetscMalloc1(bi[mb], &cols));
1942   PetscCall(MatSeqAIJGetArrayRead(a->B, &bv));
1943   pbv = bv;
1944   row = A->rmap->rstart;
1945   for (i = 0; i < bi[mb]; i++) cols[i] = a->garray[bj[i]];
1946   cols_tmp = cols;
1947   for (i = 0; i < mb; i++) {
1948     ncol = bi[i + 1] - bi[i];
1949     PetscCall(MatSetValues(B, ncol, cols_tmp, 1, &row, pbv, INSERT_VALUES));
1950     row++;
1951     if (pbv) pbv += ncol;
1952     if (cols_tmp) cols_tmp += ncol;
1953   }
1954   PetscCall(PetscFree(cols));
1955   PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bv));
1956 
1957   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1958   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1959   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1960     *matout = B;
1961   } else {
1962     PetscCall(MatHeaderMerge(A, &B));
1963   }
1964   PetscFunctionReturn(PETSC_SUCCESS);
1965 }
1966 
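/*
   Usage sketch (illustrative): both the out-of-place and the in-place transpose of a
   MATMPIAIJ matrix end up in the routine above.

     Mat At;

     PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));   // builds A^T with fresh preallocation
     PetscCall(MatTranspose(A, MAT_INPLACE_MATRIX, &A));    // replaces A by A^T
*/
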
1967 static PetscErrorCode MatDiagonalScale_MPIAIJ(Mat mat, Vec ll, Vec rr)
1968 {
1969   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1970   Mat         a = aij->A, b = aij->B;
1971   PetscInt    s1, s2, s3;
1972 
1973   PetscFunctionBegin;
1974   PetscCall(MatGetLocalSize(mat, &s2, &s3));
1975   if (rr) {
1976     PetscCall(VecGetLocalSize(rr, &s1));
1977     PetscCheck(s1 == s3, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "right vector non-conforming local size");
1978     /* Overlap communication with computation. */
1979     PetscCall(VecScatterBegin(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
1980   }
1981   if (ll) {
1982     PetscCall(VecGetLocalSize(ll, &s1));
1983     PetscCheck(s1 == s2, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "left vector non-conforming local size");
1984     PetscUseTypeMethod(b, diagonalscale, ll, NULL);
1985   }
1986   /* scale  the diagonal block */
1987   PetscUseTypeMethod(a, diagonalscale, ll, rr);
1988 
1989   if (rr) {
1990     /* Do a scatter end and then right scale the off-diagonal block */
1991     PetscCall(VecScatterEnd(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
1992     PetscUseTypeMethod(b, diagonalscale, NULL, aij->lvec);
1993   }
1994   PetscFunctionReturn(PETSC_SUCCESS);
1995 }
1996 
1997 static PetscErrorCode MatSetUnfactored_MPIAIJ(Mat A)
1998 {
1999   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2000 
2001   PetscFunctionBegin;
2002   PetscCall(MatSetUnfactored(a->A));
2003   PetscFunctionReturn(PETSC_SUCCESS);
2004 }
2005 
2006 static PetscErrorCode MatEqual_MPIAIJ(Mat A, Mat B, PetscBool *flag)
2007 {
2008   Mat_MPIAIJ *matB = (Mat_MPIAIJ *)B->data, *matA = (Mat_MPIAIJ *)A->data;
2009   Mat         a, b, c, d;
2010   PetscBool   flg;
2011 
2012   PetscFunctionBegin;
2013   a = matA->A;
2014   b = matA->B;
2015   c = matB->A;
2016   d = matB->B;
2017 
2018   PetscCall(MatEqual(a, c, &flg));
2019   if (flg) PetscCall(MatEqual(b, d, &flg));
2020   PetscCallMPI(MPIU_Allreduce(&flg, flag, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
2021   PetscFunctionReturn(PETSC_SUCCESS);
2022 }
2023 
2024 static PetscErrorCode MatCopy_MPIAIJ(Mat A, Mat B, MatStructure str)
2025 {
2026   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2027   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2028 
2029   PetscFunctionBegin;
2030   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
2031   if ((str != SAME_NONZERO_PATTERN) || (A->ops->copy != B->ops->copy)) {
2032     /* Because of the column compression in the off-process part of the matrix a->B,
2033        the number of columns in a->B and b->B may differ, hence we cannot call
2034        MatCopy() directly on the two parts. If need be, a copy more efficient than
2035        MatCopy_Basic() could be provided by first uncompressing the a->B matrices and
2036        then copying the submatrices */
2037     PetscCall(MatCopy_Basic(A, B, str));
2038   } else {
2039     PetscCall(MatCopy(a->A, b->A, str));
2040     PetscCall(MatCopy(a->B, b->B, str));
2041   }
2042   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2043   PetscFunctionReturn(PETSC_SUCCESS);
2044 }
2045 
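/*
   Usage sketch (illustrative): the fast path in MatCopy_MPIAIJ() above is taken only when the
   destination shares A's nonzero pattern and copy implementation, for example a matrix
   obtained by MatDuplicate().

     Mat B;

     PetscCall(MatDuplicate(A, MAT_DO_NOT_COPY_VALUES, &B));
     PetscCall(MatCopy(A, B, SAME_NONZERO_PATTERN));   // copies the diagonal and off-diagonal parts separately
*/
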
2046 /*
2047    Computes the number of nonzeros per row needed for preallocation when X and Y
2048    have different nonzero structure.
2049 */
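/*
   For example (illustrative): if row i of X has global columns {0, 3, 7} and row i of Y has
   global columns {3, 5}, the merge below counts the union {0, 3, 5, 7} and sets nnz[i] = 4;
   the shared column 3 is counted only once because the duplicate is skipped.
*/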
2050 PetscErrorCode MatAXPYGetPreallocation_MPIX_private(PetscInt m, const PetscInt *xi, const PetscInt *xj, const PetscInt *xltog, const PetscInt *yi, const PetscInt *yj, const PetscInt *yltog, PetscInt *nnz)
2051 {
2052   PetscInt i, j, k, nzx, nzy;
2053 
2054   PetscFunctionBegin;
2055   /* Set the number of nonzeros in the new matrix */
2056   for (i = 0; i < m; i++) {
2057     const PetscInt *xjj = PetscSafePointerPlusOffset(xj, xi[i]), *yjj = PetscSafePointerPlusOffset(yj, yi[i]);
2058     nzx    = xi[i + 1] - xi[i];
2059     nzy    = yi[i + 1] - yi[i];
2060     nnz[i] = 0;
2061     for (j = 0, k = 0; j < nzx; j++) {                                /* Point in X */
2062       for (; k < nzy && yltog[yjj[k]] < xltog[xjj[j]]; k++) nnz[i]++; /* Catch up to X */
2063       if (k < nzy && yltog[yjj[k]] == xltog[xjj[j]]) k++;             /* Skip duplicate */
2064       nnz[i]++;
2065     }
2066     for (; k < nzy; k++) nnz[i]++;
2067   }
2068   PetscFunctionReturn(PETSC_SUCCESS);
2069 }
2070 
2071 /* This is the same as MatAXPYGetPreallocation_SeqAIJ, except that the local-to-global map is provided */
2072 static PetscErrorCode MatAXPYGetPreallocation_MPIAIJ(Mat Y, const PetscInt *yltog, Mat X, const PetscInt *xltog, PetscInt *nnz)
2073 {
2074   PetscInt    m = Y->rmap->N;
2075   Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data;
2076   Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
2077 
2078   PetscFunctionBegin;
2079   PetscCall(MatAXPYGetPreallocation_MPIX_private(m, x->i, x->j, xltog, y->i, y->j, yltog, nnz));
2080   PetscFunctionReturn(PETSC_SUCCESS);
2081 }
2082 
2083 static PetscErrorCode MatAXPY_MPIAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2084 {
2085   Mat_MPIAIJ *xx = (Mat_MPIAIJ *)X->data, *yy = (Mat_MPIAIJ *)Y->data;
2086 
2087   PetscFunctionBegin;
2088   if (str == SAME_NONZERO_PATTERN) {
2089     PetscCall(MatAXPY(yy->A, a, xx->A, str));
2090     PetscCall(MatAXPY(yy->B, a, xx->B, str));
2091   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2092     PetscCall(MatAXPY_Basic(Y, a, X, str));
2093   } else {
2094     Mat       B;
2095     PetscInt *nnz_d, *nnz_o;
2096 
2097     PetscCall(PetscMalloc1(yy->A->rmap->N, &nnz_d));
2098     PetscCall(PetscMalloc1(yy->B->rmap->N, &nnz_o));
2099     PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
2100     PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
2101     PetscCall(MatSetLayouts(B, Y->rmap, Y->cmap));
2102     PetscCall(MatSetType(B, ((PetscObject)Y)->type_name));
2103     PetscCall(MatAXPYGetPreallocation_SeqAIJ(yy->A, xx->A, nnz_d));
2104     PetscCall(MatAXPYGetPreallocation_MPIAIJ(yy->B, yy->garray, xx->B, xx->garray, nnz_o));
2105     PetscCall(MatMPIAIJSetPreallocation(B, 0, nnz_d, 0, nnz_o));
2106     PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
2107     PetscCall(MatHeaderMerge(Y, &B));
2108     PetscCall(PetscFree(nnz_d));
2109     PetscCall(PetscFree(nnz_o));
2110   }
2111   PetscFunctionReturn(PETSC_SUCCESS);
2112 }
2113 
2114 PETSC_INTERN PetscErrorCode MatConjugate_SeqAIJ(Mat);
2115 
2116 static PetscErrorCode MatConjugate_MPIAIJ(Mat mat)
2117 {
2118   PetscFunctionBegin;
2119   if (PetscDefined(USE_COMPLEX)) {
2120     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2121 
2122     PetscCall(MatConjugate_SeqAIJ(aij->A));
2123     PetscCall(MatConjugate_SeqAIJ(aij->B));
2124   }
2125   PetscFunctionReturn(PETSC_SUCCESS);
2126 }
2127 
2128 static PetscErrorCode MatRealPart_MPIAIJ(Mat A)
2129 {
2130   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2131 
2132   PetscFunctionBegin;
2133   PetscCall(MatRealPart(a->A));
2134   PetscCall(MatRealPart(a->B));
2135   PetscFunctionReturn(PETSC_SUCCESS);
2136 }
2137 
2138 static PetscErrorCode MatImaginaryPart_MPIAIJ(Mat A)
2139 {
2140   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2141 
2142   PetscFunctionBegin;
2143   PetscCall(MatImaginaryPart(a->A));
2144   PetscCall(MatImaginaryPart(a->B));
2145   PetscFunctionReturn(PETSC_SUCCESS);
2146 }
2147 
2148 static PetscErrorCode MatGetRowMaxAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2149 {
2150   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
2151   PetscInt           i, *idxb = NULL, m = A->rmap->n;
2152   PetscScalar       *va, *vv;
2153   Vec                vB, vA;
2154   const PetscScalar *vb;
2155 
2156   PetscFunctionBegin;
2157   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2158   PetscCall(MatGetRowMaxAbs(a->A, vA, idx));
2159 
2160   PetscCall(VecGetArrayWrite(vA, &va));
2161   if (idx) {
2162     for (i = 0; i < m; i++) {
2163       if (PetscAbsScalar(va[i])) idx[i] += A->cmap->rstart;
2164     }
2165   }
2166 
2167   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2168   PetscCall(PetscMalloc1(m, &idxb));
2169   PetscCall(MatGetRowMaxAbs(a->B, vB, idxb));
2170 
2171   PetscCall(VecGetArrayWrite(v, &vv));
2172   PetscCall(VecGetArrayRead(vB, &vb));
2173   for (i = 0; i < m; i++) {
2174     if (PetscAbsScalar(va[i]) < PetscAbsScalar(vb[i])) {
2175       vv[i] = vb[i];
2176       if (idx) idx[i] = a->garray[idxb[i]];
2177     } else {
2178       vv[i] = va[i];
2179       if (idx && PetscAbsScalar(va[i]) == PetscAbsScalar(vb[i]) && idxb[i] != -1 && idx[i] > a->garray[idxb[i]]) idx[i] = a->garray[idxb[i]];
2180     }
2181   }
2182   PetscCall(VecRestoreArrayWrite(v, &vv));
2183   PetscCall(VecRestoreArrayWrite(vA, &va));
2184   PetscCall(VecRestoreArrayRead(vB, &vb));
2185   PetscCall(PetscFree(idxb));
2186   PetscCall(VecDestroy(&vA));
2187   PetscCall(VecDestroy(&vB));
2188   PetscFunctionReturn(PETSC_SUCCESS);
2189 }
2190 
2191 static PetscErrorCode MatGetRowSumAbs_MPIAIJ(Mat A, Vec v)
2192 {
2193   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2194   Vec         vB, vA;
2195 
2196   PetscFunctionBegin;
2197   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2198   PetscCall(MatGetRowSumAbs(a->A, vA));
2199   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2200   PetscCall(MatGetRowSumAbs(a->B, vB));
2201   PetscCall(VecAXPY(vA, 1.0, vB));
2202   PetscCall(VecDestroy(&vB));
2203   PetscCall(VecCopy(vA, v));
2204   PetscCall(VecDestroy(&vA));
2205   PetscFunctionReturn(PETSC_SUCCESS);
2206 }
2207 
2208 static PetscErrorCode MatGetRowMinAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2209 {
2210   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2211   PetscInt           m = A->rmap->n, n = A->cmap->n;
2212   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2213   PetscInt          *cmap = mat->garray;
2214   PetscInt          *diagIdx, *offdiagIdx;
2215   Vec                diagV, offdiagV;
2216   PetscScalar       *a, *diagA, *offdiagA;
2217   const PetscScalar *ba, *bav;
2218   PetscInt           r, j, col, ncols, *bi, *bj;
2219   Mat                B = mat->B;
2220   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2221 
2222   PetscFunctionBegin;
2223   /* When a process holds entire A and other processes have no entry */
2224   if (A->cmap->N == n) {
2225     PetscCall(VecGetArrayWrite(v, &diagA));
2226     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2227     PetscCall(MatGetRowMinAbs(mat->A, diagV, idx));
2228     PetscCall(VecDestroy(&diagV));
2229     PetscCall(VecRestoreArrayWrite(v, &diagA));
2230     PetscFunctionReturn(PETSC_SUCCESS);
2231   } else if (n == 0) {
2232     if (m) {
2233       PetscCall(VecGetArrayWrite(v, &a));
2234       for (r = 0; r < m; r++) {
2235         a[r] = 0.0;
2236         if (idx) idx[r] = -1;
2237       }
2238       PetscCall(VecRestoreArrayWrite(v, &a));
2239     }
2240     PetscFunctionReturn(PETSC_SUCCESS);
2241   }
2242 
2243   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2244   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2245   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2246   PetscCall(MatGetRowMinAbs(mat->A, diagV, diagIdx));
2247 
2248   /* Get offdiagIdx[] for implicit 0.0 */
2249   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2250   ba = bav;
2251   bi = b->i;
2252   bj = b->j;
2253   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2254   for (r = 0; r < m; r++) {
2255     ncols = bi[r + 1] - bi[r];
2256     if (ncols == A->cmap->N - n) { /* Brow is dense */
2257       offdiagA[r]   = *ba;
2258       offdiagIdx[r] = cmap[0];
2259     } else { /* Brow is sparse so already KNOW the minimum in absolute value is 0.0 */
2260       offdiagA[r] = 0.0;
2261 
2262       /* Find first hole in the cmap */
2263       for (j = 0; j < ncols; j++) {
2264         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2265         if (col > j && j < cstart) {
2266           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2267           break;
2268         } else if (col > j + n && j >= cstart) {
2269           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2270           break;
2271         }
2272       }
2273       if (j == ncols && ncols < A->cmap->N - n) {
2274         /* a hole is outside compressed Bcols */
2275         if (ncols == 0) {
2276           if (cstart) {
2277             offdiagIdx[r] = 0;
2278           } else offdiagIdx[r] = cend;
2279         } else { /* ncols > 0 */
2280           offdiagIdx[r] = cmap[ncols - 1] + 1;
2281           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2282         }
2283       }
2284     }
2285 
2286     for (j = 0; j < ncols; j++) {
2287       if (PetscAbsScalar(offdiagA[r]) > PetscAbsScalar(*ba)) {
2288         offdiagA[r]   = *ba;
2289         offdiagIdx[r] = cmap[*bj];
2290       }
2291       ba++;
2292       bj++;
2293     }
2294   }
2295 
2296   PetscCall(VecGetArrayWrite(v, &a));
2297   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2298   for (r = 0; r < m; ++r) {
2299     if (PetscAbsScalar(diagA[r]) < PetscAbsScalar(offdiagA[r])) {
2300       a[r] = diagA[r];
2301       if (idx) idx[r] = cstart + diagIdx[r];
2302     } else if (PetscAbsScalar(diagA[r]) == PetscAbsScalar(offdiagA[r])) {
2303       a[r] = diagA[r];
2304       if (idx) {
2305         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2306           idx[r] = cstart + diagIdx[r];
2307         } else idx[r] = offdiagIdx[r];
2308       }
2309     } else {
2310       a[r] = offdiagA[r];
2311       if (idx) idx[r] = offdiagIdx[r];
2312     }
2313   }
2314   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2315   PetscCall(VecRestoreArrayWrite(v, &a));
2316   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2317   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2318   PetscCall(VecDestroy(&diagV));
2319   PetscCall(VecDestroy(&offdiagV));
2320   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2321   PetscFunctionReturn(PETSC_SUCCESS);
2322 }
2323 
2324 static PetscErrorCode MatGetRowMin_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2325 {
2326   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2327   PetscInt           m = A->rmap->n, n = A->cmap->n;
2328   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2329   PetscInt          *cmap = mat->garray;
2330   PetscInt          *diagIdx, *offdiagIdx;
2331   Vec                diagV, offdiagV;
2332   PetscScalar       *a, *diagA, *offdiagA;
2333   const PetscScalar *ba, *bav;
2334   PetscInt           r, j, col, ncols, *bi, *bj;
2335   Mat                B = mat->B;
2336   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2337 
2338   PetscFunctionBegin;
2339   /* When a process holds entire A and other processes have no entry */
2340   if (A->cmap->N == n) {
2341     PetscCall(VecGetArrayWrite(v, &diagA));
2342     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2343     PetscCall(MatGetRowMin(mat->A, diagV, idx));
2344     PetscCall(VecDestroy(&diagV));
2345     PetscCall(VecRestoreArrayWrite(v, &diagA));
2346     PetscFunctionReturn(PETSC_SUCCESS);
2347   } else if (n == 0) {
2348     if (m) {
2349       PetscCall(VecGetArrayWrite(v, &a));
2350       for (r = 0; r < m; r++) {
2351         a[r] = PETSC_MAX_REAL;
2352         if (idx) idx[r] = -1;
2353       }
2354       PetscCall(VecRestoreArrayWrite(v, &a));
2355     }
2356     PetscFunctionReturn(PETSC_SUCCESS);
2357   }
2358 
2359   PetscCall(PetscCalloc2(m, &diagIdx, m, &offdiagIdx));
2360   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2361   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2362   PetscCall(MatGetRowMin(mat->A, diagV, diagIdx));
2363 
2364   /* Get offdiagIdx[] for implicit 0.0 */
2365   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2366   ba = bav;
2367   bi = b->i;
2368   bj = b->j;
2369   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2370   for (r = 0; r < m; r++) {
2371     ncols = bi[r + 1] - bi[r];
2372     if (ncols == A->cmap->N - n) { /* Brow is dense */
2373       offdiagA[r]   = *ba;
2374       offdiagIdx[r] = cmap[0];
2375     } else { /* Brow is sparse so already KNOW the minimum is 0.0 or lower */
2376       offdiagA[r] = 0.0;
2377 
2378       /* Find first hole in the cmap */
2379       for (j = 0; j < ncols; j++) {
2380         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2381         if (col > j && j < cstart) {
2382           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2383           break;
2384         } else if (col > j + n && j >= cstart) {
2385           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2386           break;
2387         }
2388       }
2389       if (j == ncols && ncols < A->cmap->N - n) {
2390         /* a hole is outside compressed Bcols */
2391         if (ncols == 0) {
2392           if (cstart) {
2393             offdiagIdx[r] = 0;
2394           } else offdiagIdx[r] = cend;
2395         } else { /* ncols > 0 */
2396           offdiagIdx[r] = cmap[ncols - 1] + 1;
2397           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2398         }
2399       }
2400     }
2401 
2402     for (j = 0; j < ncols; j++) {
2403       if (PetscRealPart(offdiagA[r]) > PetscRealPart(*ba)) {
2404         offdiagA[r]   = *ba;
2405         offdiagIdx[r] = cmap[*bj];
2406       }
2407       ba++;
2408       bj++;
2409     }
2410   }
2411 
2412   PetscCall(VecGetArrayWrite(v, &a));
2413   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2414   for (r = 0; r < m; ++r) {
2415     if (PetscRealPart(diagA[r]) < PetscRealPart(offdiagA[r])) {
2416       a[r] = diagA[r];
2417       if (idx) idx[r] = cstart + diagIdx[r];
2418     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2419       a[r] = diagA[r];
2420       if (idx) {
2421         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2422           idx[r] = cstart + diagIdx[r];
2423         } else idx[r] = offdiagIdx[r];
2424       }
2425     } else {
2426       a[r] = offdiagA[r];
2427       if (idx) idx[r] = offdiagIdx[r];
2428     }
2429   }
2430   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2431   PetscCall(VecRestoreArrayWrite(v, &a));
2432   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2433   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2434   PetscCall(VecDestroy(&diagV));
2435   PetscCall(VecDestroy(&offdiagV));
2436   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2437   PetscFunctionReturn(PETSC_SUCCESS);
2438 }
2439 
2440 static PetscErrorCode MatGetRowMax_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2441 {
2442   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2443   PetscInt           m = A->rmap->n, n = A->cmap->n;
2444   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2445   PetscInt          *cmap = mat->garray;
2446   PetscInt          *diagIdx, *offdiagIdx;
2447   Vec                diagV, offdiagV;
2448   PetscScalar       *a, *diagA, *offdiagA;
2449   const PetscScalar *ba, *bav;
2450   PetscInt           r, j, col, ncols, *bi, *bj;
2451   Mat                B = mat->B;
2452   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2453 
2454   PetscFunctionBegin;
2455   /* When a process holds entire A and other processes have no entry */
2456   if (A->cmap->N == n) {
2457     PetscCall(VecGetArrayWrite(v, &diagA));
2458     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2459     PetscCall(MatGetRowMax(mat->A, diagV, idx));
2460     PetscCall(VecDestroy(&diagV));
2461     PetscCall(VecRestoreArrayWrite(v, &diagA));
2462     PetscFunctionReturn(PETSC_SUCCESS);
2463   } else if (n == 0) {
2464     if (m) {
2465       PetscCall(VecGetArrayWrite(v, &a));
2466       for (r = 0; r < m; r++) {
2467         a[r] = PETSC_MIN_REAL;
2468         if (idx) idx[r] = -1;
2469       }
2470       PetscCall(VecRestoreArrayWrite(v, &a));
2471     }
2472     PetscFunctionReturn(PETSC_SUCCESS);
2473   }
2474 
2475   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2476   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2477   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2478   PetscCall(MatGetRowMax(mat->A, diagV, diagIdx));
2479 
2480   /* Get offdiagIdx[] for implicit 0.0 */
2481   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2482   ba = bav;
2483   bi = b->i;
2484   bj = b->j;
2485   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2486   for (r = 0; r < m; r++) {
2487     ncols = bi[r + 1] - bi[r];
2488     if (ncols == A->cmap->N - n) { /* Brow is dense */
2489       offdiagA[r]   = *ba;
2490       offdiagIdx[r] = cmap[0];
2491     } else { /* Brow is sparse so already KNOW maximum is 0.0 or higher */
2492       offdiagA[r] = 0.0;
2493 
2494       /* Find first hole in the cmap */
2495       for (j = 0; j < ncols; j++) {
2496         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2497         if (col > j && j < cstart) {
2498           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2499           break;
2500         } else if (col > j + n && j >= cstart) {
2501           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2502           break;
2503         }
2504       }
2505       if (j == ncols && ncols < A->cmap->N - n) {
2506         /* a hole is outside compressed Bcols */
2507         if (ncols == 0) {
2508           if (cstart) {
2509             offdiagIdx[r] = 0;
2510           } else offdiagIdx[r] = cend;
2511         } else { /* ncols > 0 */
2512           offdiagIdx[r] = cmap[ncols - 1] + 1;
2513           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2514         }
2515       }
2516     }
2517 
2518     for (j = 0; j < ncols; j++) {
2519       if (PetscRealPart(offdiagA[r]) < PetscRealPart(*ba)) {
2520         offdiagA[r]   = *ba;
2521         offdiagIdx[r] = cmap[*bj];
2522       }
2523       ba++;
2524       bj++;
2525     }
2526   }
2527 
2528   PetscCall(VecGetArrayWrite(v, &a));
2529   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2530   for (r = 0; r < m; ++r) {
2531     if (PetscRealPart(diagA[r]) > PetscRealPart(offdiagA[r])) {
2532       a[r] = diagA[r];
2533       if (idx) idx[r] = cstart + diagIdx[r];
2534     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2535       a[r] = diagA[r];
2536       if (idx) {
2537         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2538           idx[r] = cstart + diagIdx[r];
2539         } else idx[r] = offdiagIdx[r];
2540       }
2541     } else {
2542       a[r] = offdiagA[r];
2543       if (idx) idx[r] = offdiagIdx[r];
2544     }
2545   }
2546   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2547   PetscCall(VecRestoreArrayWrite(v, &a));
2548   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2549   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2550   PetscCall(VecDestroy(&diagV));
2551   PetscCall(VecDestroy(&offdiagV));
2552   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2553   PetscFunctionReturn(PETSC_SUCCESS);
2554 }
2555 
2556 PetscErrorCode MatGetSeqNonzeroStructure_MPIAIJ(Mat mat, Mat *newmat)
2557 {
2558   Mat *dummy;
2559 
2560   PetscFunctionBegin;
2561   PetscCall(MatCreateSubMatrix_MPIAIJ_All(mat, MAT_DO_NOT_GET_VALUES, MAT_INITIAL_MATRIX, &dummy));
2562   *newmat = *dummy;
2563   PetscCall(PetscFree(dummy));
2564   PetscFunctionReturn(PETSC_SUCCESS);
2565 }
2566 
2567 static PetscErrorCode MatInvertBlockDiagonal_MPIAIJ(Mat A, const PetscScalar **values)
2568 {
2569   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2570 
2571   PetscFunctionBegin;
2572   PetscCall(MatInvertBlockDiagonal(a->A, values));
2573   A->factorerrortype = a->A->factorerrortype;
2574   PetscFunctionReturn(PETSC_SUCCESS);
2575 }
2576 
2577 static PetscErrorCode MatSetRandom_MPIAIJ(Mat x, PetscRandom rctx)
2578 {
2579   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)x->data;
2580 
2581   PetscFunctionBegin;
2582   PetscCheck(x->assembled || x->preallocated, PetscObjectComm((PetscObject)x), PETSC_ERR_ARG_WRONGSTATE, "MatSetRandom on an unassembled and unpreallocated MATMPIAIJ is not allowed");
2583   PetscCall(MatSetRandom(aij->A, rctx));
2584   if (x->assembled) {
2585     PetscCall(MatSetRandom(aij->B, rctx));
2586   } else {
2587     PetscCall(MatSetRandomSkipColumnRange_SeqAIJ_Private(aij->B, x->cmap->rstart, x->cmap->rend, rctx));
2588   }
2589   PetscCall(MatAssemblyBegin(x, MAT_FINAL_ASSEMBLY));
2590   PetscCall(MatAssemblyEnd(x, MAT_FINAL_ASSEMBLY));
2591   PetscFunctionReturn(PETSC_SUCCESS);
2592 }
2593 
2594 static PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ(Mat A, PetscBool sc)
2595 {
2596   PetscFunctionBegin;
2597   if (sc) A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ_Scalable;
2598   else A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ;
2599   PetscFunctionReturn(PETSC_SUCCESS);
2600 }
2601 
2602 /*@
2603   MatMPIAIJGetNumberNonzeros - gets the number of nonzeros in the matrix on this MPI rank
2604 
2605   Not Collective
2606 
2607   Input Parameter:
2608 . A - the matrix
2609 
2610   Output Parameter:
2611 . nz - the number of nonzeros
2612 
2613   Level: advanced
2614 
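  Example Usage:
  A minimal sketch, assuming `A` is an assembled `MATMPIAIJ` matrix:
.vb
  PetscCount nz;

  PetscCall(MatMPIAIJGetNumberNonzeros(A, &nz));
.ve
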
2615 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2616 @*/
2617 PetscErrorCode MatMPIAIJGetNumberNonzeros(Mat A, PetscCount *nz)
2618 {
2619   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)A->data;
2620   Mat_SeqAIJ *aaij = (Mat_SeqAIJ *)maij->A->data, *baij = (Mat_SeqAIJ *)maij->B->data;
2621   PetscBool   isaij;
2622 
2623   PetscFunctionBegin;
2624   PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIAIJ, &isaij));
2625   PetscCheck(isaij, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
2626   *nz = aaij->i[A->rmap->n] + baij->i[A->rmap->n];
2627   PetscFunctionReturn(PETSC_SUCCESS);
2628 }
2629 
2630 /*@
2631   MatMPIAIJSetUseScalableIncreaseOverlap - Set whether the matrix uses a scalable algorithm to compute the overlap in `MatIncreaseOverlap()`
2632 
2633   Collective
2634 
2635   Input Parameters:
2636 + A  - the matrix
2637 - sc - `PETSC_TRUE` to use the scalable algorithm (by default the non-scalable algorithm is used)
2638 
2639   Level: advanced
2640 
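  Example Usage:
  A minimal sketch, assuming `A` is a `MATMPIAIJ` matrix; the same behavior can also be selected at runtime with the `-mat_increase_overlap_scalable` option:
.vb
  PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, PETSC_TRUE));
.ve
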
2641 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2642 @*/
2643 PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap(Mat A, PetscBool sc)
2644 {
2645   PetscFunctionBegin;
2646   PetscTryMethod(A, "MatMPIAIJSetUseScalableIncreaseOverlap_C", (Mat, PetscBool), (A, sc));
2647   PetscFunctionReturn(PETSC_SUCCESS);
2648 }
2649 
2650 PetscErrorCode MatSetFromOptions_MPIAIJ(Mat A, PetscOptionItems *PetscOptionsObject)
2651 {
2652   PetscBool sc = PETSC_FALSE, flg;
2653 
2654   PetscFunctionBegin;
2655   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJ options");
2656   if (A->ops->increaseoverlap == MatIncreaseOverlap_MPIAIJ_Scalable) sc = PETSC_TRUE;
2657   PetscCall(PetscOptionsBool("-mat_increase_overlap_scalable", "Use a scalable algorithm to compute the overlap", "MatIncreaseOverlap", sc, &sc, &flg));
2658   if (flg) PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, sc));
2659   PetscOptionsHeadEnd();
2660   PetscFunctionReturn(PETSC_SUCCESS);
2661 }
2662 
2663 static PetscErrorCode MatShift_MPIAIJ(Mat Y, PetscScalar a)
2664 {
2665   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)Y->data;
2666   Mat_SeqAIJ *aij  = (Mat_SeqAIJ *)maij->A->data;
2667 
2668   PetscFunctionBegin;
2669   if (!Y->preallocated) {
2670     PetscCall(MatMPIAIJSetPreallocation(Y, 1, NULL, 0, NULL));
2671   } else if (!aij->nz) { /* It does not matter if the diagonal of Y only partially lies in maij->A; we just need an estimated preallocation. */
2672     PetscInt nonew = aij->nonew;
2673     PetscCall(MatSeqAIJSetPreallocation(maij->A, 1, NULL));
2674     aij->nonew = nonew;
2675   }
2676   PetscCall(MatShift_Basic(Y, a));
2677   PetscFunctionReturn(PETSC_SUCCESS);
2678 }
2679 
2680 static PetscErrorCode MatMissingDiagonal_MPIAIJ(Mat A, PetscBool *missing, PetscInt *d)
2681 {
2682   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2683 
2684   PetscFunctionBegin;
2685   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only works for square matrices");
2686   PetscCall(MatMissingDiagonal(a->A, missing, d));
2687   if (d) {
2688     PetscInt rstart;
2689     PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
2690     *d += rstart;
2691   }
2692   PetscFunctionReturn(PETSC_SUCCESS);
2693 }
2694 
2695 static PetscErrorCode MatInvertVariableBlockDiagonal_MPIAIJ(Mat A, PetscInt nblocks, const PetscInt *bsizes, PetscScalar *diag)
2696 {
2697   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2698 
2699   PetscFunctionBegin;
2700   PetscCall(MatInvertVariableBlockDiagonal(a->A, nblocks, bsizes, diag));
2701   PetscFunctionReturn(PETSC_SUCCESS);
2702 }
2703 
2704 static PetscErrorCode MatEliminateZeros_MPIAIJ(Mat A, PetscBool keep)
2705 {
2706   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2707 
2708   PetscFunctionBegin;
2709   PetscCall(MatEliminateZeros_SeqAIJ(a->A, keep));        // possibly keep zero diagonal coefficients
2710   PetscCall(MatEliminateZeros_SeqAIJ(a->B, PETSC_FALSE)); // never keep zero diagonal coefficients
2711   PetscFunctionReturn(PETSC_SUCCESS);
2712 }
2713 
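/* Function table for MATMPIAIJ; the numbered comments give the slot index within struct _MatOps,
   and NULL entries indicate operations that this matrix type does not provide */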
2714 static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
2715                                        MatGetRow_MPIAIJ,
2716                                        MatRestoreRow_MPIAIJ,
2717                                        MatMult_MPIAIJ,
2718                                        /* 4*/ MatMultAdd_MPIAIJ,
2719                                        MatMultTranspose_MPIAIJ,
2720                                        MatMultTransposeAdd_MPIAIJ,
2721                                        NULL,
2722                                        NULL,
2723                                        NULL,
2724                                        /*10*/ NULL,
2725                                        NULL,
2726                                        NULL,
2727                                        MatSOR_MPIAIJ,
2728                                        MatTranspose_MPIAIJ,
2729                                        /*15*/ MatGetInfo_MPIAIJ,
2730                                        MatEqual_MPIAIJ,
2731                                        MatGetDiagonal_MPIAIJ,
2732                                        MatDiagonalScale_MPIAIJ,
2733                                        MatNorm_MPIAIJ,
2734                                        /*20*/ MatAssemblyBegin_MPIAIJ,
2735                                        MatAssemblyEnd_MPIAIJ,
2736                                        MatSetOption_MPIAIJ,
2737                                        MatZeroEntries_MPIAIJ,
2738                                        /*24*/ MatZeroRows_MPIAIJ,
2739                                        NULL,
2740                                        NULL,
2741                                        NULL,
2742                                        NULL,
2743                                        /*29*/ MatSetUp_MPI_Hash,
2744                                        NULL,
2745                                        NULL,
2746                                        MatGetDiagonalBlock_MPIAIJ,
2747                                        NULL,
2748                                        /*34*/ MatDuplicate_MPIAIJ,
2749                                        NULL,
2750                                        NULL,
2751                                        NULL,
2752                                        NULL,
2753                                        /*39*/ MatAXPY_MPIAIJ,
2754                                        MatCreateSubMatrices_MPIAIJ,
2755                                        MatIncreaseOverlap_MPIAIJ,
2756                                        MatGetValues_MPIAIJ,
2757                                        MatCopy_MPIAIJ,
2758                                        /*44*/ MatGetRowMax_MPIAIJ,
2759                                        MatScale_MPIAIJ,
2760                                        MatShift_MPIAIJ,
2761                                        MatDiagonalSet_MPIAIJ,
2762                                        MatZeroRowsColumns_MPIAIJ,
2763                                        /*49*/ MatSetRandom_MPIAIJ,
2764                                        MatGetRowIJ_MPIAIJ,
2765                                        MatRestoreRowIJ_MPIAIJ,
2766                                        NULL,
2767                                        NULL,
2768                                        /*54*/ MatFDColoringCreate_MPIXAIJ,
2769                                        NULL,
2770                                        MatSetUnfactored_MPIAIJ,
2771                                        MatPermute_MPIAIJ,
2772                                        NULL,
2773                                        /*59*/ MatCreateSubMatrix_MPIAIJ,
2774                                        MatDestroy_MPIAIJ,
2775                                        MatView_MPIAIJ,
2776                                        NULL,
2777                                        NULL,
2778                                        /*64*/ NULL,
2779                                        MatMatMatMultNumeric_MPIAIJ_MPIAIJ_MPIAIJ,
2780                                        NULL,
2781                                        NULL,
2782                                        NULL,
2783                                        /*69*/ MatGetRowMaxAbs_MPIAIJ,
2784                                        MatGetRowMinAbs_MPIAIJ,
2785                                        NULL,
2786                                        NULL,
2787                                        NULL,
2788                                        NULL,
2789                                        /*75*/ MatFDColoringApply_AIJ,
2790                                        MatSetFromOptions_MPIAIJ,
2791                                        NULL,
2792                                        NULL,
2793                                        MatFindZeroDiagonals_MPIAIJ,
2794                                        /*80*/ NULL,
2795                                        NULL,
2796                                        NULL,
2797                                        /*83*/ MatLoad_MPIAIJ,
2798                                        NULL,
2799                                        NULL,
2800                                        NULL,
2801                                        NULL,
2802                                        NULL,
2803                                        /*89*/ NULL,
2804                                        NULL,
2805                                        MatMatMultNumeric_MPIAIJ_MPIAIJ,
2806                                        NULL,
2807                                        NULL,
2808                                        /*94*/ MatPtAPNumeric_MPIAIJ_MPIAIJ,
2809                                        NULL,
2810                                        NULL,
2811                                        NULL,
2812                                        MatBindToCPU_MPIAIJ,
2813                                        /*99*/ MatProductSetFromOptions_MPIAIJ,
2814                                        NULL,
2815                                        NULL,
2816                                        MatConjugate_MPIAIJ,
2817                                        NULL,
2818                                        /*104*/ MatSetValuesRow_MPIAIJ,
2819                                        MatRealPart_MPIAIJ,
2820                                        MatImaginaryPart_MPIAIJ,
2821                                        NULL,
2822                                        NULL,
2823                                        /*109*/ NULL,
2824                                        NULL,
2825                                        MatGetRowMin_MPIAIJ,
2826                                        NULL,
2827                                        MatMissingDiagonal_MPIAIJ,
2828                                        /*114*/ MatGetSeqNonzeroStructure_MPIAIJ,
2829                                        NULL,
2830                                        MatGetGhosts_MPIAIJ,
2831                                        NULL,
2832                                        NULL,
2833                                        /*119*/ MatMultDiagonalBlock_MPIAIJ,
2834                                        NULL,
2835                                        NULL,
2836                                        NULL,
2837                                        MatGetMultiProcBlock_MPIAIJ,
2838                                        /*124*/ MatFindNonzeroRows_MPIAIJ,
2839                                        MatGetColumnReductions_MPIAIJ,
2840                                        MatInvertBlockDiagonal_MPIAIJ,
2841                                        MatInvertVariableBlockDiagonal_MPIAIJ,
2842                                        MatCreateSubMatricesMPI_MPIAIJ,
2843                                        /*129*/ NULL,
2844                                        NULL,
2845                                        NULL,
2846                                        MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ,
2847                                        NULL,
2848                                        /*134*/ NULL,
2849                                        NULL,
2850                                        NULL,
2851                                        NULL,
2852                                        NULL,
2853                                        /*139*/ MatSetBlockSizes_MPIAIJ,
2854                                        NULL,
2855                                        NULL,
2856                                        MatFDColoringSetUp_MPIXAIJ,
2857                                        MatFindOffBlockDiagonalEntries_MPIAIJ,
2858                                        MatCreateMPIMatConcatenateSeqMat_MPIAIJ,
2859                                        /*145*/ NULL,
2860                                        NULL,
2861                                        NULL,
2862                                        MatCreateGraph_Simple_AIJ,
2863                                        NULL,
2864                                        /*150*/ NULL,
2865                                        MatEliminateZeros_MPIAIJ,
2866                                        MatGetRowSumAbs_MPIAIJ,
2867                                        NULL,
2868                                        NULL,
2869                                        NULL};
2870 
2871 static PetscErrorCode MatStoreValues_MPIAIJ(Mat mat)
2872 {
2873   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2874 
2875   PetscFunctionBegin;
2876   PetscCall(MatStoreValues(aij->A));
2877   PetscCall(MatStoreValues(aij->B));
2878   PetscFunctionReturn(PETSC_SUCCESS);
2879 }
2880 
2881 static PetscErrorCode MatRetrieveValues_MPIAIJ(Mat mat)
2882 {
2883   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2884 
2885   PetscFunctionBegin;
2886   PetscCall(MatRetrieveValues(aij->A));
2887   PetscCall(MatRetrieveValues(aij->B));
2888   PetscFunctionReturn(PETSC_SUCCESS);
2889 }
2890 
2891 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJ(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
2892 {
2893   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2894   PetscMPIInt size;
2895 
2896   PetscFunctionBegin;
2897   if (B->hash_active) {
2898     B->ops[0]      = b->cops;
2899     B->hash_active = PETSC_FALSE;
2900   }
2901   PetscCall(PetscLayoutSetUp(B->rmap));
2902   PetscCall(PetscLayoutSetUp(B->cmap));
2903 
2904 #if defined(PETSC_USE_CTABLE)
2905   PetscCall(PetscHMapIDestroy(&b->colmap));
2906 #else
2907   PetscCall(PetscFree(b->colmap));
2908 #endif
2909   PetscCall(PetscFree(b->garray));
2910   PetscCall(VecDestroy(&b->lvec));
2911   PetscCall(VecScatterDestroy(&b->Mvctx));
2912 
2913   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2914 
2915   MatSeqXAIJGetOptions_Private(b->B);
2916   PetscCall(MatDestroy(&b->B));
2917   PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
2918   PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
2919   PetscCall(MatSetBlockSizesFromMats(b->B, B, B));
2920   PetscCall(MatSetType(b->B, MATSEQAIJ));
2921   MatSeqXAIJRestoreOptions_Private(b->B);
2922 
2923   MatSeqXAIJGetOptions_Private(b->A);
2924   PetscCall(MatDestroy(&b->A));
2925   PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
2926   PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
2927   PetscCall(MatSetBlockSizesFromMats(b->A, B, B));
2928   PetscCall(MatSetType(b->A, MATSEQAIJ));
2929   MatSeqXAIJRestoreOptions_Private(b->A);
2930 
2931   PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
2932   PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
2933   B->preallocated  = PETSC_TRUE;
2934   B->was_assembled = PETSC_FALSE;
2935   B->assembled     = PETSC_FALSE;
2936   PetscFunctionReturn(PETSC_SUCCESS);
2937 }
2938 
2939 static PetscErrorCode MatResetPreallocation_MPIAIJ(Mat B)
2940 {
2941   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2942 
2943   PetscFunctionBegin;
2944   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
2945   PetscCall(PetscLayoutSetUp(B->rmap));
2946   PetscCall(PetscLayoutSetUp(B->cmap));
2947 
2948 #if defined(PETSC_USE_CTABLE)
2949   PetscCall(PetscHMapIDestroy(&b->colmap));
2950 #else
2951   PetscCall(PetscFree(b->colmap));
2952 #endif
2953   PetscCall(PetscFree(b->garray));
2954   PetscCall(VecDestroy(&b->lvec));
2955   PetscCall(VecScatterDestroy(&b->Mvctx));
2956 
2957   PetscCall(MatResetPreallocation(b->A));
2958   PetscCall(MatResetPreallocation(b->B));
2959   B->preallocated  = PETSC_TRUE;
2960   B->was_assembled = PETSC_FALSE;
2961   B->assembled     = PETSC_FALSE;
2962   PetscFunctionReturn(PETSC_SUCCESS);
2963 }
2964 
2965 PetscErrorCode MatDuplicate_MPIAIJ(Mat matin, MatDuplicateOption cpvalues, Mat *newmat)
2966 {
2967   Mat         mat;
2968   Mat_MPIAIJ *a, *oldmat = (Mat_MPIAIJ *)matin->data;
2969 
2970   PetscFunctionBegin;
2971   *newmat = NULL;
2972   PetscCall(MatCreate(PetscObjectComm((PetscObject)matin), &mat));
2973   PetscCall(MatSetSizes(mat, matin->rmap->n, matin->cmap->n, matin->rmap->N, matin->cmap->N));
2974   PetscCall(MatSetBlockSizesFromMats(mat, matin, matin));
2975   PetscCall(MatSetType(mat, ((PetscObject)matin)->type_name));
2976   a = (Mat_MPIAIJ *)mat->data;
2977 
2978   mat->factortype = matin->factortype;
2979   mat->assembled  = matin->assembled;
2980   mat->insertmode = NOT_SET_VALUES;
2981 
2982   a->size         = oldmat->size;
2983   a->rank         = oldmat->rank;
2984   a->donotstash   = oldmat->donotstash;
2985   a->roworiented  = oldmat->roworiented;
2986   a->rowindices   = NULL;
2987   a->rowvalues    = NULL;
2988   a->getrowactive = PETSC_FALSE;
2989 
2990   PetscCall(PetscLayoutReference(matin->rmap, &mat->rmap));
2991   PetscCall(PetscLayoutReference(matin->cmap, &mat->cmap));
2992   if (matin->hash_active) {
2993     PetscCall(MatSetUp(mat));
2994   } else {
2995     mat->preallocated = matin->preallocated;
2996     if (oldmat->colmap) {
2997 #if defined(PETSC_USE_CTABLE)
2998       PetscCall(PetscHMapIDuplicate(oldmat->colmap, &a->colmap));
2999 #else
3000       PetscCall(PetscMalloc1(mat->cmap->N, &a->colmap));
3001       PetscCall(PetscArraycpy(a->colmap, oldmat->colmap, mat->cmap->N));
3002 #endif
3003     } else a->colmap = NULL;
3004     if (oldmat->garray) {
3005       PetscInt len;
3006       len = oldmat->B->cmap->n;
3007       PetscCall(PetscMalloc1(len + 1, &a->garray));
3008       if (len) PetscCall(PetscArraycpy(a->garray, oldmat->garray, len));
3009     } else a->garray = NULL;
3010 
3011     /* MatDuplicate() may be called with a non-assembled matrix;
3012       in fact, MatDuplicate() only requires the matrix to be preallocated.
3013       This can happen, for example, inside DMCreateMatrix_Shell() */
3014     if (oldmat->lvec) PetscCall(VecDuplicate(oldmat->lvec, &a->lvec));
3015     if (oldmat->Mvctx) {
3016       a->Mvctx = oldmat->Mvctx;
3017       PetscCall(PetscObjectReference((PetscObject)oldmat->Mvctx));
3018     }
3019     PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));
3020     PetscCall(MatDuplicate(oldmat->B, cpvalues, &a->B));
3021   }
3022   PetscCall(PetscFunctionListDuplicate(((PetscObject)matin)->qlist, &((PetscObject)mat)->qlist));
3023   *newmat = mat;
3024   PetscFunctionReturn(PETSC_SUCCESS);
3025 }
3026 
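/* MatLoad() implementation for MATMPIAIJ; dispatches on the viewer type (PETSc binary or HDF5).
   A minimal usage sketch (the file name "matrix.dat" is only an example and is assumed to have
   been written with MatView() in binary format):

     Mat         A;
     PetscViewer viewer;

     PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, "matrix.dat", FILE_MODE_READ, &viewer));
     PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
     PetscCall(MatSetType(A, MATMPIAIJ));
     PetscCall(MatLoad(A, viewer));
     PetscCall(PetscViewerDestroy(&viewer));
*/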
3027 PetscErrorCode MatLoad_MPIAIJ(Mat newMat, PetscViewer viewer)
3028 {
3029   PetscBool isbinary, ishdf5;
3030 
3031   PetscFunctionBegin;
3032   PetscValidHeaderSpecific(newMat, MAT_CLASSID, 1);
3033   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
3034   /* force binary viewer to load .info file if it has not yet done so */
3035   PetscCall(PetscViewerSetUp(viewer));
3036   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3037   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERHDF5, &ishdf5));
3038   if (isbinary) {
3039     PetscCall(MatLoad_MPIAIJ_Binary(newMat, viewer));
3040   } else if (ishdf5) {
3041 #if defined(PETSC_HAVE_HDF5)
3042     PetscCall(MatLoad_AIJ_HDF5(newMat, viewer));
3043 #else
3044     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "HDF5 not supported in this build.\nPlease reconfigure using --download-hdf5");
3045 #endif
3046   } else {
3047     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)newMat)->type_name);
3048   }
3049   PetscFunctionReturn(PETSC_SUCCESS);
3050 }
3051 
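/* Reads a matrix stored in PETSc binary format: a 4-entry header (MAT_FILE_CLASSID, M, N, nz)
   followed by the row lengths, the column indices, and finally the numerical values */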
3052 PetscErrorCode MatLoad_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
3053 {
3054   PetscInt     header[4], M, N, m, nz, rows, cols, sum, i;
3055   PetscInt    *rowidxs, *colidxs;
3056   PetscScalar *matvals;
3057 
3058   PetscFunctionBegin;
3059   PetscCall(PetscViewerSetUp(viewer));
3060 
3061   /* read in matrix header */
3062   PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
3063   PetscCheck(header[0] == MAT_FILE_CLASSID, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
3064   M  = header[1];
3065   N  = header[2];
3066   nz = header[3];
3067   PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
3068   PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
3069   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as MPIAIJ");
3070 
3071   /* set block sizes from the viewer's .info file */
3072   PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
3073   /* set global sizes if not set already */
3074   if (mat->rmap->N < 0) mat->rmap->N = M;
3075   if (mat->cmap->N < 0) mat->cmap->N = N;
3076   PetscCall(PetscLayoutSetUp(mat->rmap));
3077   PetscCall(PetscLayoutSetUp(mat->cmap));
3078 
3079   /* check if the matrix sizes are correct */
3080   PetscCall(MatGetSize(mat, &rows, &cols));
3081   PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);
3082 
3083   /* read in row lengths and build row indices */
3084   PetscCall(MatGetLocalSize(mat, &m, NULL));
3085   PetscCall(PetscMalloc1(m + 1, &rowidxs));
3086   PetscCall(PetscViewerBinaryReadAll(viewer, rowidxs + 1, m, PETSC_DECIDE, M, PETSC_INT));
3087   rowidxs[0] = 0;
3088   for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3089   if (nz != PETSC_INT_MAX) {
3090     PetscCallMPI(MPIU_Allreduce(&rowidxs[m], &sum, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)viewer)));
3091     PetscCheck(sum == nz, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);
3092   }
3093 
3094   /* read in column indices and matrix values */
3095   PetscCall(PetscMalloc2(rowidxs[m], &colidxs, rowidxs[m], &matvals));
3096   PetscCall(PetscViewerBinaryReadAll(viewer, colidxs, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
3097   PetscCall(PetscViewerBinaryReadAll(viewer, matvals, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
3098   /* store matrix indices and values */
3099   PetscCall(MatMPIAIJSetPreallocationCSR(mat, rowidxs, colidxs, matvals));
3100   PetscCall(PetscFree(rowidxs));
3101   PetscCall(PetscFree2(colidxs, matvals));
3102   PetscFunctionReturn(PETSC_SUCCESS);
3103 }
3104 
3105 /* Not scalable because of ISAllGather() unless getting all columns. */
3106 static PetscErrorCode ISGetSeqIS_Private(Mat mat, IS iscol, IS *isseq)
3107 {
3108   IS          iscol_local;
3109   PetscBool   isstride;
3110   PetscMPIInt lisstride = 0, gisstride;
3111 
3112   PetscFunctionBegin;
3113   /* check if we are grabbing all columns */
3114   PetscCall(PetscObjectTypeCompare((PetscObject)iscol, ISSTRIDE, &isstride));
3115 
3116   if (isstride) {
3117     PetscInt start, len, mstart, mlen;
3118     PetscCall(ISStrideGetInfo(iscol, &start, NULL));
3119     PetscCall(ISGetLocalSize(iscol, &len));
3120     PetscCall(MatGetOwnershipRangeColumn(mat, &mstart, &mlen));
3121     if (mstart == start && mlen - mstart == len) lisstride = 1;
3122   }
3123 
3124   PetscCallMPI(MPIU_Allreduce(&lisstride, &gisstride, 1, MPI_INT, MPI_MIN, PetscObjectComm((PetscObject)mat)));
3125   if (gisstride) {
3126     PetscInt N;
3127     PetscCall(MatGetSize(mat, NULL, &N));
3128     PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iscol_local));
3129     PetscCall(ISSetIdentity(iscol_local));
3130     PetscCall(PetscInfo(mat, "Optimizing for obtaining all columns of the matrix; skipping ISAllGather()\n"));
3131   } else {
3132     PetscInt cbs;
3133     PetscCall(ISGetBlockSize(iscol, &cbs));
3134     PetscCall(ISAllGather(iscol, &iscol_local));
3135     PetscCall(ISSetBlockSize(iscol_local, cbs));
3136   }
3137 
3138   *isseq = iscol_local;
3139   PetscFunctionReturn(PETSC_SUCCESS);
3140 }
3141 
3142 /*
3143  Used by MatCreateSubMatrix_MPIAIJ_SameRowColDist() to avoid ISAllGather() and global size of iscol_local
3144  (see MatCreateSubMatrix_MPIAIJ_nonscalable)
3145 
3146  Input Parameters:
3147 +   mat - matrix
3148 .   isrow - parallel row index set; its local indices are a subset of the local rows of `mat`,
3149            i.e., mat->rstart <= isrow[i] < mat->rend
3150 -   iscol - parallel column index set; its local indices are a subset of local columns of `mat`,
3151            i.e., mat->cstart <= iscol[i] < mat->cend
3152 
3153  Output Parameters:
3154 +   isrow_d - sequential row index set for retrieving mat->A
3155 .   iscol_d - sequential column index set for retrieving mat->A
3156 .   iscol_o - sequential column index set for retrieving mat->B
3157 -   garray - column map; garray[i] indicates global location of iscol_o[i] in `iscol`
3158  */
3159 static PetscErrorCode ISGetSeqIS_SameColDist_Private(Mat mat, IS isrow, IS iscol, IS *isrow_d, IS *iscol_d, IS *iscol_o, PetscInt *garray[])
3160 {
3161   Vec             x, cmap;
3162   const PetscInt *is_idx;
3163   PetscScalar    *xarray, *cmaparray;
3164   PetscInt        ncols, isstart, *idx, m, rstart, *cmap1, count;
3165   Mat_MPIAIJ     *a    = (Mat_MPIAIJ *)mat->data;
3166   Mat             B    = a->B;
3167   Vec             lvec = a->lvec, lcmap;
3168   PetscInt        i, cstart, cend, Bn = B->cmap->N;
3169   MPI_Comm        comm;
3170   VecScatter      Mvctx = a->Mvctx;
3171 
3172   PetscFunctionBegin;
3173   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3174   PetscCall(ISGetLocalSize(iscol, &ncols));
3175 
3176   /* (1) iscol selects a subset of the columns of mat; expand it into a full-length vector x, with unselected entries padded with -1 */
3177   PetscCall(MatCreateVecs(mat, &x, NULL));
3178   PetscCall(VecSet(x, -1.0));
3179   PetscCall(VecDuplicate(x, &cmap));
3180   PetscCall(VecSet(cmap, -1.0));
3181 
3182   /* Get start indices */
3183   PetscCallMPI(MPI_Scan(&ncols, &isstart, 1, MPIU_INT, MPI_SUM, comm));
3184   isstart -= ncols;
3185   PetscCall(MatGetOwnershipRangeColumn(mat, &cstart, &cend));
3186 
3187   PetscCall(ISGetIndices(iscol, &is_idx));
3188   PetscCall(VecGetArray(x, &xarray));
3189   PetscCall(VecGetArray(cmap, &cmaparray));
3190   PetscCall(PetscMalloc1(ncols, &idx));
3191   for (i = 0; i < ncols; i++) {
3192     xarray[is_idx[i] - cstart]    = (PetscScalar)is_idx[i];
3193     cmaparray[is_idx[i] - cstart] = i + isstart;        /* global index of iscol[i] */
3194     idx[i]                        = is_idx[i] - cstart; /* local index of iscol[i]  */
3195   }
3196   PetscCall(VecRestoreArray(x, &xarray));
3197   PetscCall(VecRestoreArray(cmap, &cmaparray));
3198   PetscCall(ISRestoreIndices(iscol, &is_idx));
3199 
3200   /* Get iscol_d */
3201   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, iscol_d));
3202   PetscCall(ISGetBlockSize(iscol, &i));
3203   PetscCall(ISSetBlockSize(*iscol_d, i));
3204 
3205   /* Get isrow_d */
3206   PetscCall(ISGetLocalSize(isrow, &m));
3207   rstart = mat->rmap->rstart;
3208   PetscCall(PetscMalloc1(m, &idx));
3209   PetscCall(ISGetIndices(isrow, &is_idx));
3210   for (i = 0; i < m; i++) idx[i] = is_idx[i] - rstart;
3211   PetscCall(ISRestoreIndices(isrow, &is_idx));
3212 
3213   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, m, idx, PETSC_OWN_POINTER, isrow_d));
3214   PetscCall(ISGetBlockSize(isrow, &i));
3215   PetscCall(ISSetBlockSize(*isrow_d, i));
3216 
3217   /* (2) Scatter x and cmap using aij->Mvctx to get their off-process portions (see MatMult_MPIAIJ) */
3218   PetscCall(VecScatterBegin(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3219   PetscCall(VecScatterEnd(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3220 
3221   PetscCall(VecDuplicate(lvec, &lcmap));
3222 
3223   PetscCall(VecScatterBegin(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3224   PetscCall(VecScatterEnd(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3225 
3226   /* (3) create sequential iscol_o (a subset of iscol) and isgarray */
3227   /* off-process column indices */
3228   count = 0;
3229   PetscCall(PetscMalloc1(Bn, &idx));
3230   PetscCall(PetscMalloc1(Bn, &cmap1));
3231 
3232   PetscCall(VecGetArray(lvec, &xarray));
3233   PetscCall(VecGetArray(lcmap, &cmaparray));
3234   for (i = 0; i < Bn; i++) {
3235     if (PetscRealPart(xarray[i]) > -1.0) {
3236       idx[count]   = i;                                     /* local column index in off-diagonal part B */
3237       cmap1[count] = (PetscInt)PetscRealPart(cmaparray[i]); /* column index in submat */
3238       count++;
3239     }
3240   }
3241   PetscCall(VecRestoreArray(lvec, &xarray));
3242   PetscCall(VecRestoreArray(lcmap, &cmaparray));
3243 
3244   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_COPY_VALUES, iscol_o));
3245   /* cannot ensure iscol_o has same blocksize as iscol! */
3246 
3247   PetscCall(PetscFree(idx));
3248   *garray = cmap1;
3249 
3250   PetscCall(VecDestroy(&x));
3251   PetscCall(VecDestroy(&cmap));
3252   PetscCall(VecDestroy(&lcmap));
3253   PetscFunctionReturn(PETSC_SUCCESS);
3254 }
3255 
3256 /* isrow and iscol have same processor distribution as mat, output *submat is a submatrix of local mat */
3257 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowColDist(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *submat)
3258 {
3259   Mat_MPIAIJ *a = (Mat_MPIAIJ *)mat->data, *asub;
3260   Mat         M = NULL;
3261   MPI_Comm    comm;
3262   IS          iscol_d, isrow_d, iscol_o;
3263   Mat         Asub = NULL, Bsub = NULL;
3264   PetscInt    n;
3265 
3266   PetscFunctionBegin;
3267   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3268 
3269   if (call == MAT_REUSE_MATRIX) {
3270     /* Retrieve isrow_d, iscol_d and iscol_o from submat */
3271     PetscCall(PetscObjectQuery((PetscObject)*submat, "isrow_d", (PetscObject *)&isrow_d));
3272     PetscCheck(isrow_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "isrow_d passed in was not used before, cannot reuse");
3273 
3274     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_d", (PetscObject *)&iscol_d));
3275     PetscCheck(iscol_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_d passed in was not used before, cannot reuse");
3276 
3277     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_o", (PetscObject *)&iscol_o));
3278     PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse");
3279 
3280     /* Update diagonal and off-diagonal portions of submat */
3281     asub = (Mat_MPIAIJ *)(*submat)->data;
3282     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->A));
3283     PetscCall(ISGetLocalSize(iscol_o, &n));
3284     if (n) PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->B));
3285     PetscCall(MatAssemblyBegin(*submat, MAT_FINAL_ASSEMBLY));
3286     PetscCall(MatAssemblyEnd(*submat, MAT_FINAL_ASSEMBLY));
3287 
3288   } else { /* call == MAT_INITIAL_MATRIX */
3289     PetscInt *garray;
3290     PetscInt  BsubN;
3291 
3292     /* Create isrow_d, iscol_d, iscol_o and isgarray (replace isgarray with array?) */
3293     PetscCall(ISGetSeqIS_SameColDist_Private(mat, isrow, iscol, &isrow_d, &iscol_d, &iscol_o, &garray));
3294 
3295     /* Create local submatrices Asub and Bsub */
3296     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Asub));
3297     PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Bsub));
3298 
3299     /* Create submatrix M */
3300     PetscCall(MatCreateMPIAIJWithSeqAIJ(comm, Asub, Bsub, garray, &M));
3301 
3302     /* If Bsub has empty columns, compress iscol_o such that it will retrieve condensed Bsub from a->B during reuse */
3303     asub = (Mat_MPIAIJ *)M->data;
3304 
3305     PetscCall(ISGetLocalSize(iscol_o, &BsubN));
3306     n = asub->B->cmap->N;
3307     if (BsubN > n) {
3308       /* This case can be tested using ~petsc/src/tao/bound/tutorials/runplate2_3 */
3309       const PetscInt *idx;
3310       PetscInt        i, j, *idx_new, *subgarray = asub->garray;
3311       PetscCall(PetscInfo(M, "submatrix Bn %" PetscInt_FMT " != BsubN %" PetscInt_FMT ", update iscol_o\n", n, BsubN));
3312 
3313       PetscCall(PetscMalloc1(n, &idx_new));
3314       j = 0;
3315       PetscCall(ISGetIndices(iscol_o, &idx));
3316       for (i = 0; i < n; i++) {
3317         if (j >= BsubN) break;
3318         while (subgarray[i] > garray[j]) j++;
3319 
3320         if (subgarray[i] == garray[j]) {
3321           idx_new[i] = idx[j++];
3322         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "subgarray[%" PetscInt_FMT "]=%" PetscInt_FMT " cannot be smaller than garray[%" PetscInt_FMT "]=%" PetscInt_FMT, i, subgarray[i], j, garray[j]);
3323       }
3324       PetscCall(ISRestoreIndices(iscol_o, &idx));
3325 
3326       PetscCall(ISDestroy(&iscol_o));
3327       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, idx_new, PETSC_OWN_POINTER, &iscol_o));
3328 
3329     } else if (BsubN < n) {
3330       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Columns of Bsub (%" PetscInt_FMT ") cannot be smaller than B's (%" PetscInt_FMT ")", BsubN, asub->B->cmap->N);
3331     }
3332 
3333     PetscCall(PetscFree(garray));
3334     *submat = M;
3335 
3336     /* Save isrow_d, iscol_d and iscol_o, used on this process, for the next request */
3337     PetscCall(PetscObjectCompose((PetscObject)M, "isrow_d", (PetscObject)isrow_d));
3338     PetscCall(ISDestroy(&isrow_d));
3339 
3340     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_d", (PetscObject)iscol_d));
3341     PetscCall(ISDestroy(&iscol_d));
3342 
3343     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_o", (PetscObject)iscol_o));
3344     PetscCall(ISDestroy(&iscol_o));
3345   }
3346   PetscFunctionReturn(PETSC_SUCCESS);
3347 }
3348 
3349 PetscErrorCode MatCreateSubMatrix_MPIAIJ(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *newmat)
3350 {
3351   IS        iscol_local = NULL, isrow_d;
3352   PetscInt  csize;
3353   PetscInt  n, i, j, start, end;
3354   PetscBool sameRowDist = PETSC_FALSE, sameDist[2], tsameDist[2];
3355   MPI_Comm  comm;
3356 
3357   PetscFunctionBegin;
3358   /* If isrow has the same processor distribution as mat,
3359      call MatCreateSubMatrix_MPIAIJ_SameRowDist() to avoid using a hash table whose size is the global size of iscol */
3360   if (call == MAT_REUSE_MATRIX) {
3361     PetscCall(PetscObjectQuery((PetscObject)*newmat, "isrow_d", (PetscObject *)&isrow_d));
3362     if (isrow_d) {
3363       sameRowDist  = PETSC_TRUE;
3364       tsameDist[1] = PETSC_TRUE; /* sameColDist */
3365     } else {
3366       PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_local));
3367       if (iscol_local) {
3368         sameRowDist  = PETSC_TRUE;
3369         tsameDist[1] = PETSC_FALSE; /* !sameColDist */
3370       }
3371     }
3372   } else {
3373     /* Check if isrow has same processor distribution as mat */
3374     sameDist[0] = PETSC_FALSE;
3375     PetscCall(ISGetLocalSize(isrow, &n));
3376     if (!n) {
3377       sameDist[0] = PETSC_TRUE;
3378     } else {
3379       PetscCall(ISGetMinMax(isrow, &i, &j));
3380       PetscCall(MatGetOwnershipRange(mat, &start, &end));
3381       if (i >= start && j < end) sameDist[0] = PETSC_TRUE;
3382     }
3383 
3384     /* Check if iscol has same processor distribution as mat */
3385     sameDist[1] = PETSC_FALSE;
3386     PetscCall(ISGetLocalSize(iscol, &n));
3387     if (!n) {
3388       sameDist[1] = PETSC_TRUE;
3389     } else {
3390       PetscCall(ISGetMinMax(iscol, &i, &j));
3391       PetscCall(MatGetOwnershipRangeColumn(mat, &start, &end));
3392       if (i >= start && j < end) sameDist[1] = PETSC_TRUE;
3393     }
3394 
3395     PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3396     PetscCallMPI(MPIU_Allreduce(&sameDist, &tsameDist, 2, MPIU_BOOL, MPI_LAND, comm));
3397     sameRowDist = tsameDist[0];
3398   }
3399 
3400   if (sameRowDist) {
3401     if (tsameDist[1]) { /* sameRowDist & sameColDist */
3402       /* isrow and iscol have same processor distribution as mat */
3403       PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowColDist(mat, isrow, iscol, call, newmat));
3404       PetscFunctionReturn(PETSC_SUCCESS);
3405     } else { /* sameRowDist */
3406       /* isrow has same processor distribution as mat */
3407       if (call == MAT_INITIAL_MATRIX) {
3408         PetscBool sorted;
3409         PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3410         PetscCall(ISGetLocalSize(iscol_local, &n)); /* local size of iscol_local = global columns of newmat */
3411         PetscCall(ISGetSize(iscol, &i));
3412         PetscCheck(n == i, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "n %" PetscInt_FMT " != size of iscol %" PetscInt_FMT, n, i);
3413 
3414         PetscCall(ISSorted(iscol_local, &sorted));
3415         if (sorted) {
3416           /* MatCreateSubMatrix_MPIAIJ_SameRowDist() requires iscol_local to be sorted; it may contain duplicate indices */
3417           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, iscol_local, MAT_INITIAL_MATRIX, newmat));
3418           PetscFunctionReturn(PETSC_SUCCESS);
3419         }
3420       } else { /* call == MAT_REUSE_MATRIX */
3421         IS iscol_sub;
3422         PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3423         if (iscol_sub) {
3424           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, NULL, call, newmat));
3425           PetscFunctionReturn(PETSC_SUCCESS);
3426         }
3427       }
3428     }
3429   }
3430 
3431   /* General case: iscol -> iscol_local which has global size of iscol */
3432   if (call == MAT_REUSE_MATRIX) {
3433     PetscCall(PetscObjectQuery((PetscObject)*newmat, "ISAllGather", (PetscObject *)&iscol_local));
3434     PetscCheck(iscol_local, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3435   } else {
3436     if (!iscol_local) PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3437   }
3438 
3439   PetscCall(ISGetLocalSize(iscol, &csize));
3440   PetscCall(MatCreateSubMatrix_MPIAIJ_nonscalable(mat, isrow, iscol_local, csize, call, newmat));
3441 
3442   if (call == MAT_INITIAL_MATRIX) {
3443     PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3444     PetscCall(ISDestroy(&iscol_local));
3445   }
3446   PetscFunctionReturn(PETSC_SUCCESS);
3447 }
3448 
3449 /*@C
3450   MatCreateMPIAIJWithSeqAIJ - creates a `MATMPIAIJ` matrix using `MATSEQAIJ` matrices that contain the "diagonal"
3451   and "off-diagonal" part of the matrix in CSR format.
3452 
3453   Collective
3454 
3455   Input Parameters:
3456 + comm   - MPI communicator
3457 . A      - "diagonal" portion of matrix
3458 . B      - "off-diagonal" portion of the matrix; it may have empty columns and is destroyed by this routine
3459 - garray - global index of `B` columns
3460 
3461   Output Parameter:
3462 . mat - the matrix, with input `A` as its local diagonal matrix
3463 
3464   Level: advanced
3465 
3466   Notes:
3467   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix.
3468 
3469   `A` becomes part of the output `mat` and `B` is destroyed by this routine; the caller must not use `A` or `B` after this call.
3470 
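  Example Usage:
  A minimal sketch, assuming `Asub` and `Bsub` are `MATSEQAIJ` matrices with the same number of rows and
  `garray[]` maps the columns of `Bsub` to global column indices; both matrices are consumed by the call:
.vb
  Mat C;

  PetscCall(MatCreateMPIAIJWithSeqAIJ(PETSC_COMM_WORLD, Asub, Bsub, garray, &C));
.ve
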
3471 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateMPIAIJWithSplitArrays()`
3472 @*/
3473 PetscErrorCode MatCreateMPIAIJWithSeqAIJ(MPI_Comm comm, Mat A, Mat B, const PetscInt garray[], Mat *mat)
3474 {
3475   Mat_MPIAIJ        *maij;
3476   Mat_SeqAIJ        *b  = (Mat_SeqAIJ *)B->data, *bnew;
3477   PetscInt          *oi = b->i, *oj = b->j, i, nz, col;
3478   const PetscScalar *oa;
3479   Mat                Bnew;
3480   PetscInt           m, n, N;
3481   MatType            mpi_mat_type;
3482 
3483   PetscFunctionBegin;
3484   PetscCall(MatCreate(comm, mat));
3485   PetscCall(MatGetSize(A, &m, &n));
3486   PetscCheck(m == B->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Am %" PetscInt_FMT " != Bm %" PetscInt_FMT, m, B->rmap->N);
3487   PetscCheck(PetscAbs(A->rmap->bs) == PetscAbs(B->rmap->bs), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "A row bs %" PetscInt_FMT " != B row bs %" PetscInt_FMT, A->rmap->bs, B->rmap->bs);
3488   /* the check below is removed: when B is created using iscol_o from ISGetSeqIS_SameColDist_Private(), its bs may not be the same as that of A */
3489   /* PetscCheck(A->cmap->bs == B->cmap->bs,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"A column bs %" PetscInt_FMT " != B column bs %" PetscInt_FMT,A->cmap->bs,B->cmap->bs); */
3490 
3491   /* Get global columns of mat */
3492   PetscCallMPI(MPIU_Allreduce(&n, &N, 1, MPIU_INT, MPI_SUM, comm));
3493 
3494   PetscCall(MatSetSizes(*mat, m, n, PETSC_DECIDE, N));
3495   /* Determine the type of MPI matrix that should be created from the type of matrix A, which holds the "diagonal" portion. */
3496   PetscCall(MatGetMPIMatType_Private(A, &mpi_mat_type));
3497   PetscCall(MatSetType(*mat, mpi_mat_type));
3498 
3499   if (A->rmap->bs > 1 || A->cmap->bs > 1) PetscCall(MatSetBlockSizes(*mat, A->rmap->bs, A->cmap->bs));
3500   maij = (Mat_MPIAIJ *)(*mat)->data;
3501 
3502   (*mat)->preallocated = PETSC_TRUE;
3503 
3504   PetscCall(PetscLayoutSetUp((*mat)->rmap));
3505   PetscCall(PetscLayoutSetUp((*mat)->cmap));
3506 
3507   /* Set A as diagonal portion of *mat */
3508   maij->A = A;
3509 
3510   nz = oi[m];
3511   for (i = 0; i < nz; i++) {
3512     col   = oj[i];
3513     oj[i] = garray[col];
3514   }
3515 
3516   /* Set Bnew as off-diagonal portion of *mat */
3517   PetscCall(MatSeqAIJGetArrayRead(B, &oa));
3518   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, N, oi, oj, (PetscScalar *)oa, &Bnew));
3519   PetscCall(MatSeqAIJRestoreArrayRead(B, &oa));
3520   bnew        = (Mat_SeqAIJ *)Bnew->data;
3521   bnew->maxnz = b->maxnz; /* allocated nonzeros of B */
3522   maij->B     = Bnew;
3523 
3524   PetscCheck(B->rmap->N == Bnew->rmap->N, PETSC_COMM_SELF, PETSC_ERR_PLIB, "BN %" PetscInt_FMT " != BnewN %" PetscInt_FMT, B->rmap->N, Bnew->rmap->N);
3525 
3526   b->free_a  = PETSC_FALSE;
3527   b->free_ij = PETSC_FALSE;
3528   PetscCall(MatDestroy(&B));
3529 
3530   bnew->free_a  = PETSC_TRUE;
3531   bnew->free_ij = PETSC_TRUE;
3532 
3533   /* condense columns of maij->B */
3534   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
3535   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
3536   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
3537   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
3538   PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3539   PetscFunctionReturn(PETSC_SUCCESS);
3540 }
3541 
3542 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, PetscBool, Mat *);
3543 
3544 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowDist(Mat mat, IS isrow, IS iscol, IS iscol_local, MatReuse call, Mat *newmat)
3545 {
3546   PetscInt        i, m, n, rstart, row, rend, nz, j, bs, cbs;
3547   PetscInt       *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3548   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)mat->data;
3549   Mat             M, Msub, B = a->B;
3550   MatScalar      *aa;
3551   Mat_SeqAIJ     *aij;
3552   PetscInt       *garray = a->garray, *colsub, Ncols;
3553   PetscInt        count, Bn = B->cmap->N, cstart = mat->cmap->rstart, cend = mat->cmap->rend;
3554   IS              iscol_sub, iscmap;
3555   const PetscInt *is_idx, *cmap;
3556   PetscBool       allcolumns = PETSC_FALSE;
3557   MPI_Comm        comm;
3558 
3559   PetscFunctionBegin;
3560   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3561   if (call == MAT_REUSE_MATRIX) {
3562     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3563     PetscCheck(iscol_sub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "SubIScol passed in was not used before, cannot reuse");
3564     PetscCall(ISGetLocalSize(iscol_sub, &count));
3565 
3566     PetscCall(PetscObjectQuery((PetscObject)*newmat, "Subcmap", (PetscObject *)&iscmap));
3567     PetscCheck(iscmap, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Subcmap passed in was not used before, cannot reuse");
3568 
3569     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Msub));
3570     PetscCheck(Msub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3571 
3572     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_REUSE_MATRIX, PETSC_FALSE, &Msub));
3573 
3574   } else { /* call == MAT_INITIAL_MATRIX */
3575     PetscBool flg;
3576 
3577     PetscCall(ISGetLocalSize(iscol, &n));
3578     PetscCall(ISGetSize(iscol, &Ncols));
3579 
3580     /* (1) iscol -> nonscalable iscol_local */
3581     /* Check for special case: each processor gets entire matrix columns */
3582     PetscCall(ISIdentity(iscol_local, &flg));
3583     if (flg && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3584     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3585     if (allcolumns) {
3586       iscol_sub = iscol_local;
3587       PetscCall(PetscObjectReference((PetscObject)iscol_local));
3588       PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iscmap));
3589 
3590     } else {
3591       /* (2) iscol_local -> iscol_sub and iscmap. The implementation below requires iscol_local to be sorted; it may contain duplicate indices */
3592       PetscInt *idx, *cmap1, k;
3593       PetscCall(PetscMalloc1(Ncols, &idx));
3594       PetscCall(PetscMalloc1(Ncols, &cmap1));
3595       PetscCall(ISGetIndices(iscol_local, &is_idx));
3596       count = 0;
3597       k     = 0;
3598       for (i = 0; i < Ncols; i++) {
3599         j = is_idx[i];
3600         if (j >= cstart && j < cend) {
3601           /* diagonal part of mat */
3602           idx[count]     = j;
3603           cmap1[count++] = i; /* column index in submat */
3604         } else if (Bn) {
3605           /* off-diagonal part of mat */
3606           if (j == garray[k]) {
3607             idx[count]     = j;
3608             cmap1[count++] = i; /* column index in submat */
3609           } else if (j > garray[k]) {
3610             while (j > garray[k] && k < Bn - 1) k++;
3611             if (j == garray[k]) {
3612               idx[count]     = j;
3613               cmap1[count++] = i; /* column index in submat */
3614             }
3615           }
3616         }
3617       }
3618       PetscCall(ISRestoreIndices(iscol_local, &is_idx));
3619 
3620       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_OWN_POINTER, &iscol_sub));
3621       PetscCall(ISGetBlockSize(iscol, &cbs));
3622       PetscCall(ISSetBlockSize(iscol_sub, cbs));
3623 
3624       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)iscol_local), count, cmap1, PETSC_OWN_POINTER, &iscmap));
3625     }
3626 
3627     /* (3) Create sequential Msub */
3628     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_INITIAL_MATRIX, allcolumns, &Msub));
3629   }
3630 
3631   PetscCall(ISGetLocalSize(iscol_sub, &count));
3632   aij = (Mat_SeqAIJ *)Msub->data;
3633   ii  = aij->i;
3634   PetscCall(ISGetIndices(iscmap, &cmap));
3635 
3636   /*
3637       m - number of local rows
3638       Ncols - number of columns (same on all processors)
3639       rstart - first row in new global matrix generated
3640   */
3641   PetscCall(MatGetSize(Msub, &m, NULL));
3642 
3643   if (call == MAT_INITIAL_MATRIX) {
3644     /* (4) Create parallel newmat */
3645     PetscMPIInt rank, size;
3646     PetscInt    csize;
3647 
3648     PetscCallMPI(MPI_Comm_size(comm, &size));
3649     PetscCallMPI(MPI_Comm_rank(comm, &rank));
3650 
3651     /*
3652         Determine the number of non-zeros in the diagonal and off-diagonal
3653         portions of the matrix in order to do correct preallocation
3654     */
3655 
3656     /* first get start and end of "diagonal" columns */
3657     PetscCall(ISGetLocalSize(iscol, &csize));
3658     if (csize == PETSC_DECIDE) {
3659       PetscCall(ISGetSize(isrow, &mglobal));
3660       if (mglobal == Ncols) { /* square matrix */
3661         nlocal = m;
3662       } else {
3663         nlocal = Ncols / size + ((Ncols % size) > rank);
3664       }
3665     } else {
3666       nlocal = csize;
3667     }
3668     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3669     rstart = rend - nlocal;
3670     PetscCheck(rank != size - 1 || rend == Ncols, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, Ncols);
3671 
3672     /* next, compute all the lengths */
3673     jj = aij->j;
3674     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3675     olens = dlens + m;
3676     for (i = 0; i < m; i++) {
3677       jend = ii[i + 1] - ii[i];
3678       olen = 0;
3679       dlen = 0;
3680       for (j = 0; j < jend; j++) {
3681         if (cmap[*jj] < rstart || cmap[*jj] >= rend) olen++;
3682         else dlen++;
3683         jj++;
3684       }
3685       olens[i] = olen;
3686       dlens[i] = dlen;
3687     }
3688 
3689     PetscCall(ISGetBlockSize(isrow, &bs));
3690     PetscCall(ISGetBlockSize(iscol, &cbs));
3691 
3692     PetscCall(MatCreate(comm, &M));
3693     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, Ncols));
3694     PetscCall(MatSetBlockSizes(M, bs, cbs));
3695     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3696     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3697     PetscCall(PetscFree(dlens));
3698 
3699   } else { /* call == MAT_REUSE_MATRIX */
3700     M = *newmat;
3701     PetscCall(MatGetLocalSize(M, &i, NULL));
3702     PetscCheck(i == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3703     PetscCall(MatZeroEntries(M));
3704     /*
3705          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3706        rather than the slower MatSetValues().
3707     */
3708     M->was_assembled = PETSC_TRUE;
3709     M->assembled     = PETSC_FALSE;
3710   }
3711 
3712   /* (5) Set values of Msub to *newmat */
3713   PetscCall(PetscMalloc1(count, &colsub));
3714   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
3715 
3716   jj = aij->j;
3717   PetscCall(MatSeqAIJGetArrayRead(Msub, (const PetscScalar **)&aa));
3718   for (i = 0; i < m; i++) {
3719     row = rstart + i;
3720     nz  = ii[i + 1] - ii[i];
3721     for (j = 0; j < nz; j++) colsub[j] = cmap[jj[j]];
3722     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, colsub, aa, INSERT_VALUES));
3723     jj += nz;
3724     aa += nz;
3725   }
3726   PetscCall(MatSeqAIJRestoreArrayRead(Msub, (const PetscScalar **)&aa));
3727   PetscCall(ISRestoreIndices(iscmap, &cmap));
3728 
3729   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3730   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3731 
3732   PetscCall(PetscFree(colsub));
3733 
3734   /* save Msub, iscol_sub and iscmap, used on this process, for the next request */
3735   if (call == MAT_INITIAL_MATRIX) {
3736     *newmat = M;
3737     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubMatrix", (PetscObject)Msub));
3738     PetscCall(MatDestroy(&Msub));
3739 
3740     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubIScol", (PetscObject)iscol_sub));
3741     PetscCall(ISDestroy(&iscol_sub));
3742 
3743     PetscCall(PetscObjectCompose((PetscObject)*newmat, "Subcmap", (PetscObject)iscmap));
3744     PetscCall(ISDestroy(&iscmap));
3745 
3746     if (iscol_local) {
3747       PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3748       PetscCall(ISDestroy(&iscol_local));
3749     }
3750   }
3751   PetscFunctionReturn(PETSC_SUCCESS);
3752 }
3753 
3754 /*
3755     Not great since it makes two copies of the submatrix: first a local SeqAIJ copy,
3756   and then the final result obtained by concatenating the local matrices.
3757   Writing it directly would be much like MatCreateSubMatrices_MPIAIJ().
3758 
3759   This requires a sequential iscol with all indices.
3760 */
3761 PetscErrorCode MatCreateSubMatrix_MPIAIJ_nonscalable(Mat mat, IS isrow, IS iscol, PetscInt csize, MatReuse call, Mat *newmat)
3762 {
3763   PetscMPIInt rank, size;
3764   PetscInt    i, m, n, rstart, row, rend, nz, *cwork, j, bs, cbs;
3765   PetscInt   *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3766   Mat         M, Mreuse;
3767   MatScalar  *aa, *vwork;
3768   MPI_Comm    comm;
3769   Mat_SeqAIJ *aij;
3770   PetscBool   colflag, allcolumns = PETSC_FALSE;
3771 
3772   PetscFunctionBegin;
3773   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3774   PetscCallMPI(MPI_Comm_rank(comm, &rank));
3775   PetscCallMPI(MPI_Comm_size(comm, &size));
3776 
3777   /* Check for special case: each processor gets entire matrix columns */
3778   PetscCall(ISIdentity(iscol, &colflag));
3779   PetscCall(ISGetLocalSize(iscol, &n));
3780   if (colflag && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3781   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3782 
3783   if (call == MAT_REUSE_MATRIX) {
3784     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Mreuse));
3785     PetscCheck(Mreuse, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3786     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_REUSE_MATRIX, allcolumns, &Mreuse));
3787   } else {
3788     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, allcolumns, &Mreuse));
3789   }
3790 
3791   /*
3792       m - number of local rows
3793       n - number of columns (same on all processors)
3794       rstart - first row in new global matrix generated
3795   */
3796   PetscCall(MatGetSize(Mreuse, &m, &n));
3797   PetscCall(MatGetBlockSizes(Mreuse, &bs, &cbs));
3798   if (call == MAT_INITIAL_MATRIX) {
3799     aij = (Mat_SeqAIJ *)Mreuse->data;
3800     ii  = aij->i;
3801     jj  = aij->j;
3802 
3803     /*
3804         Determine the number of non-zeros in the diagonal and off-diagonal
3805         portions of the matrix in order to do correct preallocation
3806     */
3807 
3808     /* first get start and end of "diagonal" columns */
3809     if (csize == PETSC_DECIDE) {
3810       PetscCall(ISGetSize(isrow, &mglobal));
3811       if (mglobal == n) { /* square matrix */
3812         nlocal = m;
3813       } else {
3814         nlocal = n / size + ((n % size) > rank);
3815       }
3816     } else {
3817       nlocal = csize;
3818     }
3819     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3820     rstart = rend - nlocal;
3821     PetscCheck(rank != size - 1 || rend == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, n);
3822 
3823     /* next, compute all the lengths */
3824     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3825     olens = dlens + m;
3826     for (i = 0; i < m; i++) {
3827       jend = ii[i + 1] - ii[i];
3828       olen = 0;
3829       dlen = 0;
3830       for (j = 0; j < jend; j++) {
3831         if (*jj < rstart || *jj >= rend) olen++;
3832         else dlen++;
3833         jj++;
3834       }
3835       olens[i] = olen;
3836       dlens[i] = dlen;
3837     }
3838     PetscCall(MatCreate(comm, &M));
3839     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, n));
3840     PetscCall(MatSetBlockSizes(M, bs, cbs));
3841     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3842     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3843     PetscCall(PetscFree(dlens));
3844   } else {
3845     PetscInt ml, nl;
3846 
3847     M = *newmat;
3848     PetscCall(MatGetLocalSize(M, &ml, &nl));
3849     PetscCheck(ml == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3850     PetscCall(MatZeroEntries(M));
3851     /*
3852          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3853        rather than the slower MatSetValues().
3854     */
3855     M->was_assembled = PETSC_TRUE;
3856     M->assembled     = PETSC_FALSE;
3857   }
3858   PetscCall(MatGetOwnershipRange(M, &rstart, &rend));
3859   aij = (Mat_SeqAIJ *)Mreuse->data;
3860   ii  = aij->i;
3861   jj  = aij->j;
3862 
3863   /* trigger copy to CPU if needed */
3864   PetscCall(MatSeqAIJGetArrayRead(Mreuse, (const PetscScalar **)&aa));
3865   for (i = 0; i < m; i++) {
3866     row   = rstart + i;
3867     nz    = ii[i + 1] - ii[i];
3868     cwork = jj;
3869     jj    = PetscSafePointerPlusOffset(jj, nz);
3870     vwork = aa;
3871     aa    = PetscSafePointerPlusOffset(aa, nz);
3872     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, cwork, vwork, INSERT_VALUES));
3873   }
3874   PetscCall(MatSeqAIJRestoreArrayRead(Mreuse, (const PetscScalar **)&aa));
3875 
3876   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3877   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3878   *newmat = M;
3879 
3880   /* save submatrix used in processor for next request */
3881   if (call == MAT_INITIAL_MATRIX) {
3882     PetscCall(PetscObjectCompose((PetscObject)M, "SubMatrix", (PetscObject)Mreuse));
3883     PetscCall(MatDestroy(&Mreuse));
3884   }
3885   PetscFunctionReturn(PETSC_SUCCESS);
3886 }
3887 
3888 static PetscErrorCode MatMPIAIJSetPreallocationCSR_MPIAIJ(Mat B, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
3889 {
3890   PetscInt        m, cstart, cend, j, nnz, i, d, *ld;
3891   PetscInt       *d_nnz, *o_nnz, nnz_max = 0, rstart, ii, irstart;
3892   const PetscInt *JJ;
3893   PetscBool       nooffprocentries;
3894   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)B->data;
3895 
3896   PetscFunctionBegin;
3897   PetscCall(PetscLayoutSetUp(B->rmap));
3898   PetscCall(PetscLayoutSetUp(B->cmap));
3899   m       = B->rmap->n;
3900   cstart  = B->cmap->rstart;
3901   cend    = B->cmap->rend;
3902   rstart  = B->rmap->rstart;
3903   irstart = Ii[0];
3904 
3905   PetscCall(PetscCalloc2(m, &d_nnz, m, &o_nnz));
3906 
3907   if (PetscDefined(USE_DEBUG)) {
3908     for (i = 0; i < m; i++) {
3909       nnz = Ii[i + 1] - Ii[i];
3910       JJ  = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3911       PetscCheck(nnz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative %" PetscInt_FMT " number of columns", i, nnz);
3912       PetscCheck(!nnz || !(JJ[0] < 0), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " starts with negative column index %" PetscInt_FMT, i, JJ[0]);
3913       PetscCheck(!nnz || !(JJ[nnz - 1] >= B->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " ends with too large a column index %" PetscInt_FMT " (max allowed %" PetscInt_FMT ")", i, JJ[nnz - 1], B->cmap->N);
3914     }
3915   }
3916 
3917   for (i = 0; i < m; i++) {
3918     nnz     = Ii[i + 1] - Ii[i];
3919     JJ      = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3920     nnz_max = PetscMax(nnz_max, nnz);
3921     d       = 0;
3922     for (j = 0; j < nnz; j++) {
3923       if (cstart <= JJ[j] && JJ[j] < cend) d++;
3924     }
3925     d_nnz[i] = d;
3926     o_nnz[i] = nnz - d;
3927   }
3928   PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
3929   PetscCall(PetscFree2(d_nnz, o_nnz));
3930 
3931   for (i = 0; i < m; i++) {
3932     ii = i + rstart;
3933     PetscCall(MatSetValues_MPIAIJ(B, 1, &ii, Ii[i + 1] - Ii[i], PetscSafePointerPlusOffset(J, Ii[i] - irstart), PetscSafePointerPlusOffset(v, Ii[i] - irstart), INSERT_VALUES));
3934   }
3935   nooffprocentries    = B->nooffprocentries;
3936   B->nooffprocentries = PETSC_TRUE;
3937   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
3938   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
3939   B->nooffprocentries = nooffprocentries;
3940 
3941   /* count number of entries below block diagonal */
3942   PetscCall(PetscFree(Aij->ld));
3943   PetscCall(PetscCalloc1(m, &ld));
3944   Aij->ld = ld;
3945   for (i = 0; i < m; i++) {
3946     nnz = Ii[i + 1] - Ii[i];
3947     j   = 0;
3948     while (j < nnz && J[j] < cstart) j++;
3949     ld[i] = j;
3950     if (J) J += nnz;
3951   }
3952 
3953   PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3954   PetscFunctionReturn(PETSC_SUCCESS);
3955 }
3956 
3957 /*@
3958   MatMPIAIJSetPreallocationCSR - Allocates memory for a sparse parallel matrix in `MATAIJ` format
3959   (the default parallel PETSc format).
3960 
3961   Collective
3962 
3963   Input Parameters:
3964 + B - the matrix
3965 . i - the indices into `j` for the start of each local row (indices start with zero)
3966 . j - the column indices for each local row (indices start with zero)
3967 - v - optional values in the matrix
3968 
3969   Level: developer
3970 
3971   Notes:
3972   The `i`, `j`, and `v` arrays ARE copied by this routine into the internal format used by PETSc;
3973   thus you CANNOT change the matrix entries by changing the values of `v` after you have
3974   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
3975 
3976   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
3977 
3978   A convenience routine for this functionality is `MatCreateMPIAIJWithArrays()`.
3979 
3980   You can update the matrix with new numerical values using `MatUpdateMPIAIJWithArrays()` after this call if the column indices in `j` are sorted.
3981 
3982   If you do **not** use `MatUpdateMPIAIJWithArrays()`, the column indices in `j` do not need to be sorted. If you will use
3983   `MatUpdateMPIAIJWithArrays()`, the column indices **must** be sorted.
3984 
3985   The format which is used for the sparse matrix input, is equivalent to a
3986   row-major ordering, i.e., for the following matrix, the input data expected is
3987   as shown
3988 .vb
3989         1 0 0
3990         2 0 3     P0
3991        -------
3992         4 5 6     P1
3993 
3994      Process0 [P0] rows_owned=[0,1]
3995         i =  {0,1,3}  [size = nrow+1  = 2+1]
3996         j =  {0,0,2}  [size = 3]
3997         v =  {1,2,3}  [size = 3]
3998 
3999      Process1 [P1] rows_owned=[2]
4000         i =  {0,3}    [size = nrow+1  = 1+1]
4001         j =  {0,1,2}  [size = 3]
4002         v =  {4,5,6}  [size = 3]
4003 .ve
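
   For instance, a minimal sketch (hypothetical local arrays, shown for process P0 of the example above;
   P1 would pass its own `i`, `j`, `v` for its single local row) of how this routine might be called

.vb
     PetscInt    i[] = {0, 1, 3};   /* row offsets for the 2 local rows on P0 */
     PetscInt    j[] = {0, 0, 2};   /* global column indices */
     PetscScalar v[] = {1, 2, 3};   /* optional values */
     Mat         B;

     MatCreate(PETSC_COMM_WORLD, &B);
     MatSetSizes(B, 2, PETSC_DECIDE, 3, 3);   /* 2 local rows on P0, 3x3 globally */
     MatSetType(B, MATMPIAIJ);
     MatMPIAIJSetPreallocationCSR(B, i, j, v);
.ve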
4004 
4005 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatCreateAIJ()`,
4006           `MatCreateSeqAIJWithArrays()`, `MatCreateMPIAIJWithSplitArrays()`, `MatCreateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4007 @*/
4008 PetscErrorCode MatMPIAIJSetPreallocationCSR(Mat B, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
4009 {
4010   PetscFunctionBegin;
4011   PetscTryMethod(B, "MatMPIAIJSetPreallocationCSR_C", (Mat, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, i, j, v));
4012   PetscFunctionReturn(PETSC_SUCCESS);
4013 }
4014 
4015 /*@
4016   MatMPIAIJSetPreallocation - Preallocates memory for a sparse parallel matrix in `MATMPIAIJ` format
4017   (the default parallel PETSc format).  For good matrix assembly performance
4018   the user should preallocate the matrix storage by setting the parameters
4019   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4020 
4021   Collective
4022 
4023   Input Parameters:
4024 + B     - the matrix
4025 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4026            (same value is used for all local rows)
4027 . d_nnz - array containing the number of nonzeros in the various rows of the
4028            DIAGONAL portion of the local submatrix (possibly different for each row)
4029            or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `d_nz` is used to specify the nonzero structure.
4030            The size of this array is equal to the number of local rows, i.e 'm'.
4031            For matrices that will be factored, you must leave room for (and set)
4032            the diagonal entry even if it is zero.
4033 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4034            submatrix (same value is used for all local rows).
4035 - o_nnz - array containing the number of nonzeros in the various rows of the
4036            OFF-DIAGONAL portion of the local submatrix (possibly different for
4037            each row) or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `o_nz` is used to specify the nonzero
4038            structure. The size of this array is equal to the number
4039            of local rows, i.e 'm'.
4040 
4041   Example Usage:
4042   Consider the following 8x8 matrix with 34 non-zero values, that is
4043   assembled across 3 processors. Let's assume that proc0 owns 3 rows,
4044   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4045   as follows
4046 
4047 .vb
4048             1  2  0  |  0  3  0  |  0  4
4049     Proc0   0  5  6  |  7  0  0  |  8  0
4050             9  0 10  | 11  0  0  | 12  0
4051     -------------------------------------
4052            13  0 14  | 15 16 17  |  0  0
4053     Proc1   0 18  0  | 19 20 21  |  0  0
4054             0  0  0  | 22 23  0  | 24  0
4055     -------------------------------------
4056     Proc2  25 26 27  |  0  0 28  | 29  0
4057            30  0  0  | 31 32 33  |  0 34
4058 .ve
4059 
4060   This can be represented as a collection of submatrices as
4061 .vb
4062       A B C
4063       D E F
4064       G H I
4065 .ve
4066 
4067   Where the submatrices A,B,C are owned by proc0, D,E,F are
4068   owned by proc1, G,H,I are owned by proc2.
4069 
4070   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4071   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4072   The 'M','N' parameters are 8,8, and have the same values on all procs.
4073 
4074   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4075   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4076   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4077   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4078   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4079   matrix, ans [DF] as another `MATSEQAIJ` matrix.
4080   matrix, and [DF] as another `MATSEQAIJ` matrix.
4081   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4082   allocated for every row of the local diagonal submatrix, and `o_nz`
4083   storage locations are allocated for every row of the OFF-DIAGONAL submat.
4084   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros per
4085   local row in each of the local DIAGONAL and OFF-DIAGONAL submatrices.
4086   In this case, the values of `d_nz`, `o_nz` are
4087 .vb
4088      proc0  dnz = 2, o_nz = 2
4089      proc1  dnz = 3, o_nz = 2
4090      proc2  dnz = 1, o_nz = 4
4091 .ve
4092   We are allocating `m`*(`d_nz`+`o_nz`) storage locations for every proc. This
4093   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
4094   for proc2, i.e., we are using 12+15+10=37 storage locations to store
4095   34 values.
4096 
4097   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4098   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4099   In the above case the values for `d_nnz`, `o_nnz` are
4100 .vb
4101      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4102      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4103      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4104 .ve
4105   Here the space allocated is the sum of all the above values, i.e., 34, and
4106   hence pre-allocation is perfect.
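
   For example, a rough sketch of the call made on proc1 above (assuming the matrix B has already been
   created, sized, and set to type `MATMPIAIJ` with `MatCreate()`, `MatSetSizes()`, and `MatSetType()`)

.vb
     PetscInt d_nnz[] = {3, 3, 2};   /* nonzeros per local row in the DIAGONAL block [E] */
     PetscInt o_nnz[] = {2, 1, 1};   /* nonzeros per local row in the OFF-DIAGONAL block [DF] */

     MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz);
.ve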
4107 
4108   Level: intermediate
4109 
4110   Notes:
4111   If the *_nnz parameter is given then the *_nz parameter is ignored
4112 
4113   The `MATAIJ` format, also called compressed row storage (CSR), is compatible with standard Fortran
4114   storage.  The stored row and column indices begin with zero.
4115   See [Sparse Matrices](sec_matsparse) for details.
4116 
4117   The parallel matrix is partitioned such that the first m0 rows belong to
4118   process 0, the next m1 rows belong to process 1, the next m2 rows belong
4119   to process 2, etc., where m0,m1,m2... are the input parameter 'm'.
4120 
4121   The DIAGONAL portion of the local submatrix of a processor can be defined
4122   as the submatrix which is obtained by extracting the part corresponding to
4123   the rows r1-r2 and columns c1-c2 of the global matrix, where r1 is the
4124   first row that belongs to the processor, r2 is the last row belonging to
4125   this processor, and c1-c2 is the range of indices of the local part of a
4126   vector suitable for applying the matrix to.  This is an mxn matrix.  In the
4127   common case of a square matrix, the row and column ranges are the same and
4128   the DIAGONAL part is also square. The remaining portion of the local
4129   submatrix (m x (N-n)) constitutes the OFF-DIAGONAL portion.
4130 
4131   If `o_nnz` and `d_nnz` are specified, then `o_nz` and `d_nz` are ignored.
4132 
4133   You can call `MatGetInfo()` to get information on how effective the preallocation was;
4134   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
4135   You can also run with the option `-info` and look for messages with the string
4136   malloc in them to see if additional memory allocation was needed.
4137 
4138 .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MATMPIAIJ`, `MATAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateAIJ()`, `MatMPIAIJSetPreallocationCSR()`,
4139           `MatGetInfo()`, `PetscSplitOwnership()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4140 @*/
4141 PetscErrorCode MatMPIAIJSetPreallocation(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
4142 {
4143   PetscFunctionBegin;
4144   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
4145   PetscValidType(B, 1);
4146   PetscTryMethod(B, "MatMPIAIJSetPreallocation_C", (Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[]), (B, d_nz, d_nnz, o_nz, o_nnz));
4147   PetscFunctionReturn(PETSC_SUCCESS);
4148 }
4149 
4150 /*@
4151   MatCreateMPIAIJWithArrays - creates a `MATMPIAIJ` matrix using arrays that contain the local
4152   rows in standard CSR format.
4153 
4154   Collective
4155 
4156   Input Parameters:
4157 + comm - MPI communicator
4158 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
4159 . n    - This value should be the same as the local size used in creating the
4160          x vector for the matrix-vector product $y = Ax$ (or `PETSC_DECIDE` to have
4161          it calculated if `N` is given). For square matrices `n` is almost always `m`.
4162 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
4163 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
4164 . i    - row indices (of length m+1); that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
4165 . j    - global column indices
4166 - a    - optional matrix values
4167 
4168   Output Parameter:
4169 . mat - the matrix
4170 
4171   Level: intermediate
4172 
4173   Notes:
4174   The `i`, `j`, and `a` arrays ARE copied by this routine into the internal format used by PETSc;
4175   thus you CANNOT change the matrix entries by changing the values of `a[]` after you have
4176   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
4177 
4178   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
4179 
4180   Once you have created the matrix you can update it with new numerical values using `MatUpdateMPIAIJWithArray()`
4181 
4182   If you do **not** use `MatUpdateMPIAIJWithArray()`, the column indices in `j` do not need to be sorted. If you will use
4183   `MatUpdateMPIAIJWithArray()`, the column indices **must** be sorted.
4184 
4185   The format which is used for the sparse matrix input, is equivalent to a
4186   row-major ordering, i.e., for the following matrix, the input data expected is
4187   as shown
4188 .vb
4189         1 0 0
4190         2 0 3     P0
4191        -------
4192         4 5 6     P1
4193 
4194      Process0 [P0] rows_owned=[0,1]
4195         i =  {0,1,3}  [size = nrow+1  = 2+1]
4196         j =  {0,0,2}  [size = 3]
4197         v =  {1,2,3}  [size = 3]
4198 
4199      Process1 [P1] rows_owned=[2]
4200         i =  {0,3}    [size = nrow+1  = 1+1]
4201         j =  {0,1,2}  [size = 3]
4202         v =  {4,5,6}  [size = 3]
4203 .ve
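
  A minimal sketch of the corresponding call on process P0 of the example above (P1 passes its own
  `i`, `j`, `v` with m = 1):

.vb
     PetscInt    i[] = {0, 1, 3};
     PetscInt    j[] = {0, 0, 2};
     PetscScalar v[] = {1, 2, 3};
     Mat         A;

     MatCreateMPIAIJWithArrays(PETSC_COMM_WORLD, 2, PETSC_DECIDE, PETSC_DETERMINE, 3, i, j, v, &A);
.ve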
4204 
4205 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4206           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4207 @*/
4208 PetscErrorCode MatCreateMPIAIJWithArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt i[], const PetscInt j[], const PetscScalar a[], Mat *mat)
4209 {
4210   PetscFunctionBegin;
4211   PetscCheck(!i || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4212   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4213   PetscCall(MatCreate(comm, mat));
4214   PetscCall(MatSetSizes(*mat, m, n, M, N));
4215   /* PetscCall(MatSetBlockSizes(M,bs,cbs)); */
4216   PetscCall(MatSetType(*mat, MATMPIAIJ));
4217   PetscCall(MatMPIAIJSetPreallocationCSR(*mat, i, j, a));
4218   PetscFunctionReturn(PETSC_SUCCESS);
4219 }
4220 
4221 /*@
4222   MatUpdateMPIAIJWithArrays - updates a `MATMPIAIJ` matrix using arrays that contain the local
4223   rows in standard CSR format. Only the numerical values are updated; the other arrays must be identical to what was passed
4224   to `MatCreateMPIAIJWithArrays()`
4225 
4226   Deprecated: Use `MatUpdateMPIAIJWithArray()`
4227 
4228   Collective
4229 
4230   Input Parameters:
4231 + mat - the matrix
4232 . m   - number of local rows (Cannot be `PETSC_DECIDE`)
4233 . n   - This value should be the same as the local size used in creating the
4234        x vector for the matrix-vector product y = Ax (or `PETSC_DECIDE` to have
4235        it calculated if N is given). For square matrices n is almost always m.
4236 . M   - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4237 . N   - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4238 . Ii  - row indices; that is Ii[0] = 0, Ii[row] = Ii[row-1] + number of elements in that row of the matrix
4239 . J   - column indices
4240 - v   - matrix values
4241 
4242   Level: deprecated
4243 
4244 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4245           `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4246 @*/
4247 PetscErrorCode MatUpdateMPIAIJWithArrays(Mat mat, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
4248 {
4249   PetscInt        nnz, i;
4250   PetscBool       nooffprocentries;
4251   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4252   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4253   PetscScalar    *ad, *ao;
4254   PetscInt        ldi, Iii, md;
4255   const PetscInt *Adi = Ad->i;
4256   PetscInt       *ld  = Aij->ld;
4257 
4258   PetscFunctionBegin;
4259   PetscCheck(Ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4260   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4261   PetscCheck(m == mat->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of rows cannot change from call to MatUpdateMPIAIJWithArrays()");
4262   PetscCheck(n == mat->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of columns cannot change from call to MatUpdateMPIAIJWithArrays()");
4263 
4264   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4265   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4266 
4267   for (i = 0; i < m; i++) {
4268     if (PetscDefined(USE_DEBUG)) {
4269       for (PetscInt j = Ii[i] + 1; j < Ii[i + 1]; ++j) {
4270         PetscCheck(J[j] >= J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is not sorted", j - Ii[i], J[j], i);
4271         PetscCheck(J[j] != J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is identical to previous entry", j - Ii[i], J[j], i);
4272       }
4273     }
4274     nnz = Ii[i + 1] - Ii[i];
4275     Iii = Ii[i];
4276     ldi = ld[i];
4277     md  = Adi[i + 1] - Adi[i];
4278     PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4279     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4280     PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4281     ad += md;
4282     ao += nnz - md;
4283   }
4284   nooffprocentries      = mat->nooffprocentries;
4285   mat->nooffprocentries = PETSC_TRUE;
4286   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4287   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4288   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4289   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4290   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4291   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4292   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4293   mat->nooffprocentries = nooffprocentries;
4294   PetscFunctionReturn(PETSC_SUCCESS);
4295 }
4296 
4297 /*@
4298   MatUpdateMPIAIJWithArray - updates a `MATMPIAIJ` matrix using an array that contains the nonzero values
4299 
4300   Collective
4301 
4302   Input Parameters:
4303 + mat - the matrix
4304 - v   - matrix values, stored by row
4305 
4306   Level: intermediate
4307 
4308   Notes:
4309   The matrix must have been obtained with `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()`
4310 
4311   The column indices in the call to `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()` must have been sorted for this call to work correctly
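
  A rough sketch of the typical pattern (vnew is a hypothetical array holding the new values in the same
  row-by-row ordering that was used when the matrix was created):

.vb
     MatCreateMPIAIJWithArrays(comm, m, n, M, N, i, j, v, &A);
     /* ... later: the nonzero pattern is unchanged, only the numerical values differ ... */
     MatUpdateMPIAIJWithArray(A, vnew);
.ve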
4312 
4313 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4314           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4315 @*/
4316 PetscErrorCode MatUpdateMPIAIJWithArray(Mat mat, const PetscScalar v[])
4317 {
4318   PetscInt        nnz, i, m;
4319   PetscBool       nooffprocentries;
4320   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4321   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4322   Mat_SeqAIJ     *Ao  = (Mat_SeqAIJ *)Aij->B->data;
4323   PetscScalar    *ad, *ao;
4324   const PetscInt *Adi = Ad->i, *Adj = Ao->i;
4325   PetscInt        ldi, Iii, md;
4326   PetscInt       *ld = Aij->ld;
4327 
4328   PetscFunctionBegin;
4329   m = mat->rmap->n;
4330 
4331   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4332   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4333   Iii = 0;
4334   for (i = 0; i < m; i++) {
4335     nnz = Adi[i + 1] - Adi[i] + Adj[i + 1] - Adj[i];
4336     ldi = ld[i];
4337     md  = Adi[i + 1] - Adi[i];
4338     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4339     ad += md;
4340     if (ao) {
4341       PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4342       PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4343       ao += nnz - md;
4344     }
4345     Iii += nnz;
4346   }
4347   nooffprocentries      = mat->nooffprocentries;
4348   mat->nooffprocentries = PETSC_TRUE;
4349   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4350   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4351   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4352   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4353   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4354   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4355   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4356   mat->nooffprocentries = nooffprocentries;
4357   PetscFunctionReturn(PETSC_SUCCESS);
4358 }
4359 
4360 /*@
4361   MatCreateAIJ - Creates a sparse parallel matrix in `MATAIJ` format
4362   (the default parallel PETSc format).  For good matrix assembly performance
4363   the user should preallocate the matrix storage by setting the parameters
4364   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4365 
4366   Collective
4367 
4368   Input Parameters:
4369 + comm  - MPI communicator
4370 . m     - number of local rows (or `PETSC_DECIDE` to have calculated if M is given)
4371           This value should be the same as the local size used in creating the
4372           y vector for the matrix-vector product y = Ax.
4373 . n     - This value should be the same as the local size used in creating the
4374           x vector for the matrix-vector product y = Ax (or `PETSC_DECIDE` to have
4375           it calculated if N is given). For square matrices n is almost always m.
4376 . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4377 . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4378 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4379           (same value is used for all local rows)
4380 . d_nnz - array containing the number of nonzeros in the various rows of the
4381           DIAGONAL portion of the local submatrix (possibly different for each row)
4382           or `NULL`, if `d_nz` is used to specify the nonzero structure.
4383           The size of this array is equal to the number of local rows, i.e 'm'.
4384 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4385           submatrix (same value is used for all local rows).
4386 - o_nnz - array containing the number of nonzeros in the various rows of the
4387           OFF-DIAGONAL portion of the local submatrix (possibly different for
4388           each row) or `NULL`, if `o_nz` is used to specify the nonzero
4389           structure. The size of this array is equal to the number
4390           of local rows, i.e 'm'.
4391 
4392   Output Parameter:
4393 . A - the matrix
4394 
4395   Options Database Keys:
4396 + -mat_no_inode                     - Do not use inodes
4397 . -mat_inode_limit <limit>          - Sets inode limit (max limit=5)
4398 - -matmult_vecscatter_view <viewer> - View the vecscatter (i.e., communication pattern) used in `MatMult()` of sparse parallel matrices.
4399                                       See viewer types in manual of `MatView()`. Of them, ascii_matlab, draw or binary cause the `VecScatter`
4400                                       to be viewed as a matrix. Entry (i,j) is the size of message (in bytes) rank i sends to rank j in one `MatMult()` call.
4401 
4402   Level: intermediate
4403 
4404   Notes:
4405   It is recommended that one use `MatCreateFromOptions()` or the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
4406   MatXXXXSetPreallocation() paradigm instead of this routine directly.
4407   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
4408 
4409   If the *_nnz parameter is given then the *_nz parameter is ignored
4410 
4411   The `m`,`n`,`M`,`N` parameters specify the size of the matrix, and its partitioning across
4412   processors, while `d_nz`,`d_nnz`,`o_nz`,`o_nnz` parameters specify the approximate
4413   storage requirements for this matrix.
4414 
4415   If `PETSC_DECIDE` or  `PETSC_DETERMINE` is used for a particular argument on one
4416   processor then it must be used on all processors that share the object for
4417   that argument.
4418 
4419   If `m` and `n` are not `PETSC_DECIDE`, then the values determine the `PetscLayout` of the matrix and the ranges returned by
4420   `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`, and `MatGetOwnershipRangesColumn()`.
4421 
4422   The user MUST specify either the local or global matrix dimensions
4423   (possibly both).
4424 
4425   The parallel matrix is partitioned across processors such that the
4426   first `m0` rows belong to process 0, the next `m1` rows belong to
4427   process 1, the next `m2` rows belong to process 2, etc., where
4428   `m0`, `m1`, `m2`... are the input parameter `m` on each MPI process. I.e., each MPI process stores
4429   values corresponding to an [m x N] submatrix.
4430 
4431   The columns are logically partitioned with the n0 columns belonging
4432   to the 0th partition, the next n1 columns belonging to the next
4433   partition, etc., where n0,n1,n2... are the input parameter 'n'.
4434 
4435   The DIAGONAL portion of the local submatrix on any given processor
4436   is the submatrix corresponding to the rows and columns m,n
4437   corresponding to the given processor, i.e., the diagonal matrix on
4438   process 0 is [m0 x n0], diagonal matrix on process 1 is [m1 x n1]
4439   etc. The remaining portion of the local submatrix [m x (N-n)]
4440   constitutes the OFF-DIAGONAL portion. The example below better
4441   illustrates this concept.
4442 
4443   For a square global matrix we define each processor's diagonal portion
4444   to be its local rows and the corresponding columns (a square submatrix);
4445   each processor's off-diagonal portion encompasses the remainder of the
4446   local matrix (a rectangular submatrix).
4447 
4448   If `o_nnz` and `d_nnz` are specified, then `o_nz` and `d_nz` are ignored.
4449 
4450   When calling this routine with a single process communicator, a matrix of
4451   type `MATSEQAIJ` is returned.  If a matrix of type `MATMPIAIJ` is desired for this
4452   type of communicator, use the construction mechanism
4453 .vb
4454   MatCreate(..., &A);
4455   MatSetType(A, MATMPIAIJ);
4456   MatSetSizes(A, m, n, M, N);
4457   MatMPIAIJSetPreallocation(A, ...);
4458 .ve
4459 
4460   By default, this format uses inodes (identical nodes) when possible.
4461   We search for consecutive rows with the same nonzero structure, thereby
4462   reusing matrix information to achieve increased efficiency.
4463 
4464   Example Usage:
4465   Consider the following 8x8 matrix with 34 non-zero values, that is
4466   assembled across 3 processors. Let's assume that proc0 owns 3 rows,
4467   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4468   as follows
4469 
4470 .vb
4471             1  2  0  |  0  3  0  |  0  4
4472     Proc0   0  5  6  |  7  0  0  |  8  0
4473             9  0 10  | 11  0  0  | 12  0
4474     -------------------------------------
4475            13  0 14  | 15 16 17  |  0  0
4476     Proc1   0 18  0  | 19 20 21  |  0  0
4477             0  0  0  | 22 23  0  | 24  0
4478     -------------------------------------
4479     Proc2  25 26 27  |  0  0 28  | 29  0
4480            30  0  0  | 31 32 33  |  0 34
4481 .ve
4482 
4483   This can be represented as a collection of submatrices as
4484 
4485 .vb
4486       A B C
4487       D E F
4488       G H I
4489 .ve
4490 
4491   Where the submatrices A,B,C are owned by proc0, D,E,F are
4492   owned by proc1, G,H,I are owned by proc2.
4493 
4494   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4495   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4496   The 'M','N' parameters are 8,8, and have the same values on all procs.
4497 
4498   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4499   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4500   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4501   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4502   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4503   matrix, and [DF] as another `MATSEQAIJ` matrix.
4504 
4505   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4506   allocated for every row of the local diagonal submatrix, and `o_nz`
4507   storage locations are allocated for every row of the OFF-DIAGONAL submat.
4508   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros per
4509   local row in each of the local DIAGONAL and OFF-DIAGONAL submatrices.
4510   In this case, the values of `d_nz`,`o_nz` are
4511 .vb
4512      proc0  dnz = 2, o_nz = 2
4513      proc1  dnz = 3, o_nz = 2
4514      proc2  dnz = 1, o_nz = 4
4515 .ve
4516   We are allocating m*(`d_nz`+`o_nz`) storage locations for every proc. This
4517   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
4518   for proc2, i.e., we are using 12+15+10=37 storage locations to store
4519   34 values.
4520 
4521   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4522   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4523   In the above case the values for d_nnz,o_nnz are
4524 .vb
4525      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4526      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4527      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4528 .ve
4529   Here the space allocated is the sum of all the above values, i.e., 34, and
4530   hence pre-allocation is perfect.
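
  For instance, a minimal sketch of the call made on proc2 of the example (each process passes its own
  local sizes and nnz arrays; the hypothetical names mirror the table above):

.vb
     PetscInt d_nnz[] = {1, 1};
     PetscInt o_nnz[] = {4, 4};
     Mat      A;

     MatCreateAIJ(PETSC_COMM_WORLD, 2, 2, 8, 8, 0, d_nnz, 0, o_nnz, &A);
.ve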
4531 
4532 .seealso: [](ch_matrices), `Mat`, [Sparse Matrix Creation](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4533           `MATMPIAIJ`, `MatCreateMPIAIJWithArrays()`, `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`,
4534           `MatGetOwnershipRangesColumn()`, `PetscLayout`
4535 @*/
4536 PetscErrorCode MatCreateAIJ(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
4537 {
4538   PetscMPIInt size;
4539 
4540   PetscFunctionBegin;
4541   PetscCall(MatCreate(comm, A));
4542   PetscCall(MatSetSizes(*A, m, n, M, N));
4543   PetscCallMPI(MPI_Comm_size(comm, &size));
4544   if (size > 1) {
4545     PetscCall(MatSetType(*A, MATMPIAIJ));
4546     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
4547   } else {
4548     PetscCall(MatSetType(*A, MATSEQAIJ));
4549     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
4550   }
4551   PetscFunctionReturn(PETSC_SUCCESS);
4552 }
4553 
4554 /*MC
4555     MatMPIAIJGetSeqAIJF90 - Returns the local pieces of this distributed matrix
4556 
4557     Synopsis:
4558     MatMPIAIJGetSeqAIJF90(Mat A, Mat Ad, Mat Ao, {PetscInt, pointer :: colmap(:)},integer ierr)
4559 
4560     Not Collective
4561 
4562     Input Parameter:
4563 .   A - the `MATMPIAIJ` matrix
4564 
4565     Output Parameters:
4566 +   Ad - the diagonal portion of the matrix
4567 .   Ao - the off-diagonal portion of the matrix
4568 .   colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4569 -   ierr - error code
4570 
4571      Level: advanced
4572 
4573     Note:
4574     Use  `MatMPIAIJRestoreSeqAIJF90()` when you no longer need access to the matrices and `colmap`
4575 
4576 .seealso: [](ch_matrices), `Mat`, [](sec_fortranarrays), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJ()`, `MatMPIAIJRestoreSeqAIJF90()`
4577 M*/
4578 
4579 /*MC
4580     MatMPIAIJRestoreSeqAIJF90 - call after `MatMPIAIJGetSeqAIJF90()` when you no longer need access to the matrices and `colmap`
4581 
4582     Synopsis:
4583     MatMPIAIJRestoreSeqAIJF90(Mat A, Mat Ad, Mat Ao, {PetscInt, pointer :: colmap(:)},integer ierr)
4584 
4585     Not Collective
4586 
4587     Input Parameters:
4588 +   A - the `MATMPIAIJ` matrix
4589 .   Ad - the diagonal portion of the matrix
4590 .   Ao - the off-diagonal portion of the matrix
4591 .   colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4592 -   ierr - error code
4593 
4594      Level: advanced
4595 
4596 .seealso: [](ch_matrices), `Mat`, [](sec_fortranarrays), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJ()`, `MatMPIAIJGetSeqAIJF90()`
4597 M*/
4598 
4599 /*@C
4600   MatMPIAIJGetSeqAIJ - Returns the local pieces of this distributed matrix
4601 
4602   Not Collective
4603 
4604   Input Parameter:
4605 . A - The `MATMPIAIJ` matrix
4606 
4607   Output Parameters:
4608 + Ad     - The local diagonal block as a `MATSEQAIJ` matrix
4609 . Ao     - The local off-diagonal block as a `MATSEQAIJ` matrix
4610 - colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4611 
4612   Level: intermediate
4613 
4614   Note:
4615   The rows in `Ad` and `Ao` are in [0, Nr), where Nr is the number of local rows on this process. The columns
4616   in `Ad` are in [0, Nc) where Nc is the number of local columns. The columns in `Ao` are in [0, Nco), where Nco is
4617   the number of nonzero columns in the local off-diagonal piece of the matrix `A`. The array colmap maps these
4618   local column numbers to global column numbers in the original matrix.
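
  For example, a small sketch (lcol is assumed to be a valid local column index of `Ao`) that maps a local
  column of the off-diagonal block back to a global column of `A`

.vb
     Mat             Ad, Ao;
     const PetscInt *colmap;
     PetscInt        gcol;

     MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &colmap);
     gcol = colmap[lcol]; /* global column of A for local column lcol of Ao */
.ve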
4619 
4620   Fortran Notes:
4621   `MatMPIAIJGetSeqAIJ()` Fortran binding is deprecated (since PETSc 3.19), use `MatMPIAIJGetSeqAIJF90()`
4622 
4623 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJF90()`, `MatMPIAIJRestoreSeqAIJF90()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`, `MatCreateAIJ()`, `MATSEQAIJ`
4624 @*/
4625 PetscErrorCode MatMPIAIJGetSeqAIJ(Mat A, Mat *Ad, Mat *Ao, const PetscInt *colmap[])
4626 {
4627   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
4628   PetscBool   flg;
4629 
4630   PetscFunctionBegin;
4631   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &flg));
4632   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "This function requires a MATMPIAIJ matrix as input");
4633   if (Ad) *Ad = a->A;
4634   if (Ao) *Ao = a->B;
4635   if (colmap) *colmap = a->garray;
4636   PetscFunctionReturn(PETSC_SUCCESS);
4637 }
4638 
4639 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_MPIAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
4640 {
4641   PetscInt     m, N, i, rstart, nnz, Ii;
4642   PetscInt    *indx;
4643   PetscScalar *values;
4644   MatType      rootType;
4645 
4646   PetscFunctionBegin;
4647   PetscCall(MatGetSize(inmat, &m, &N));
4648   if (scall == MAT_INITIAL_MATRIX) { /* symbolic phase */
4649     PetscInt *dnz, *onz, sum, bs, cbs;
4650 
4651     if (n == PETSC_DECIDE) PetscCall(PetscSplitOwnership(comm, &n, &N));
4652     /* Check sum(n) = N */
4653     PetscCallMPI(MPIU_Allreduce(&n, &sum, 1, MPIU_INT, MPI_SUM, comm));
4654     PetscCheck(sum == N, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Sum of local columns %" PetscInt_FMT " != global columns %" PetscInt_FMT, sum, N);
4655 
4656     PetscCallMPI(MPI_Scan(&m, &rstart, 1, MPIU_INT, MPI_SUM, comm));
4657     rstart -= m;
4658 
4659     MatPreallocateBegin(comm, m, n, dnz, onz);
4660     for (i = 0; i < m; i++) {
4661       PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4662       PetscCall(MatPreallocateSet(i + rstart, nnz, indx, dnz, onz));
4663       PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4664     }
4665 
4666     PetscCall(MatCreate(comm, outmat));
4667     PetscCall(MatSetSizes(*outmat, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
4668     PetscCall(MatGetBlockSizes(inmat, &bs, &cbs));
4669     PetscCall(MatSetBlockSizes(*outmat, bs, cbs));
4670     PetscCall(MatGetRootType_Private(inmat, &rootType));
4671     PetscCall(MatSetType(*outmat, rootType));
4672     PetscCall(MatSeqAIJSetPreallocation(*outmat, 0, dnz));
4673     PetscCall(MatMPIAIJSetPreallocation(*outmat, 0, dnz, 0, onz));
4674     MatPreallocateEnd(dnz, onz);
4675     PetscCall(MatSetOption(*outmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
4676   }
4677 
4678   /* numeric phase */
4679   PetscCall(MatGetOwnershipRange(*outmat, &rstart, NULL));
4680   for (i = 0; i < m; i++) {
4681     PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4682     Ii = i + rstart;
4683     PetscCall(MatSetValues(*outmat, 1, &Ii, nnz, indx, values, INSERT_VALUES));
4684     PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4685   }
4686   PetscCall(MatAssemblyBegin(*outmat, MAT_FINAL_ASSEMBLY));
4687   PetscCall(MatAssemblyEnd(*outmat, MAT_FINAL_ASSEMBLY));
4688   PetscFunctionReturn(PETSC_SUCCESS);
4689 }
4690 
4691 static PetscErrorCode MatDestroy_MPIAIJ_SeqsToMPI(void **data)
4692 {
4693   Mat_Merge_SeqsToMPI *merge = (Mat_Merge_SeqsToMPI *)*data;
4694 
4695   PetscFunctionBegin;
4696   if (!merge) PetscFunctionReturn(PETSC_SUCCESS);
4697   PetscCall(PetscFree(merge->id_r));
4698   PetscCall(PetscFree(merge->len_s));
4699   PetscCall(PetscFree(merge->len_r));
4700   PetscCall(PetscFree(merge->bi));
4701   PetscCall(PetscFree(merge->bj));
4702   PetscCall(PetscFree(merge->buf_ri[0]));
4703   PetscCall(PetscFree(merge->buf_ri));
4704   PetscCall(PetscFree(merge->buf_rj[0]));
4705   PetscCall(PetscFree(merge->buf_rj));
4706   PetscCall(PetscFree(merge->coi));
4707   PetscCall(PetscFree(merge->coj));
4708   PetscCall(PetscFree(merge->owners_co));
4709   PetscCall(PetscLayoutDestroy(&merge->rowmap));
4710   PetscCall(PetscFree(merge));
4711   PetscFunctionReturn(PETSC_SUCCESS);
4712 }
4713 
4714 #include <../src/mat/utils/freespace.h>
4715 #include <petscbt.h>
4716 
4717 PetscErrorCode MatCreateMPIAIJSumSeqAIJNumeric(Mat seqmat, Mat mpimat)
4718 {
4719   MPI_Comm             comm;
4720   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
4721   PetscMPIInt          size, rank, taga, *len_s;
4722   PetscInt             N = mpimat->cmap->N, i, j, *owners, *ai = a->i, *aj, m;
4723   PetscMPIInt          proc, k;
4724   PetscInt           **buf_ri, **buf_rj;
4725   PetscInt             anzi, *bj_i, *bi, *bj, arow, bnzi, nextaj;
4726   PetscInt             nrows, **buf_ri_k, **nextrow, **nextai;
4727   MPI_Request         *s_waits, *r_waits;
4728   MPI_Status          *status;
4729   const MatScalar     *aa, *a_a;
4730   MatScalar          **abuf_r, *ba_i;
4731   Mat_Merge_SeqsToMPI *merge;
4732   PetscContainer       container;
4733 
4734   PetscFunctionBegin;
4735   PetscCall(PetscObjectGetComm((PetscObject)mpimat, &comm));
4736   PetscCall(PetscLogEventBegin(MAT_Seqstompinum, seqmat, 0, 0, 0));
4737 
4738   PetscCallMPI(MPI_Comm_size(comm, &size));
4739   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4740 
4741   PetscCall(PetscObjectQuery((PetscObject)mpimat, "MatMergeSeqsToMPI", (PetscObject *)&container));
4742   PetscCheck(container, PetscObjectComm((PetscObject)mpimat), PETSC_ERR_PLIB, "Mat not created from MatCreateMPIAIJSumSeqAIJSymbolic");
4743   PetscCall(PetscContainerGetPointer(container, (void **)&merge));
4744   PetscCall(MatSeqAIJGetArrayRead(seqmat, &a_a));
4745   aa = a_a;
4746 
4747   bi     = merge->bi;
4748   bj     = merge->bj;
4749   buf_ri = merge->buf_ri;
4750   buf_rj = merge->buf_rj;
4751 
4752   PetscCall(PetscMalloc1(size, &status));
4753   owners = merge->rowmap->range;
4754   len_s  = merge->len_s;
4755 
4756   /* send and recv matrix values */
4757   PetscCall(PetscObjectGetNewTag((PetscObject)mpimat, &taga));
4758   PetscCall(PetscPostIrecvScalar(comm, taga, merge->nrecv, merge->id_r, merge->len_r, &abuf_r, &r_waits));
4759 
4760   PetscCall(PetscMalloc1(merge->nsend + 1, &s_waits));
4761   for (proc = 0, k = 0; proc < size; proc++) {
4762     if (!len_s[proc]) continue;
4763     i = owners[proc];
4764     PetscCallMPI(MPIU_Isend(aa + ai[i], len_s[proc], MPIU_MATSCALAR, proc, taga, comm, s_waits + k));
4765     k++;
4766   }
4767 
4768   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, r_waits, status));
4769   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, s_waits, status));
4770   PetscCall(PetscFree(status));
4771 
4772   PetscCall(PetscFree(s_waits));
4773   PetscCall(PetscFree(r_waits));
4774 
4775   /* insert mat values of mpimat */
4776   PetscCall(PetscMalloc1(N, &ba_i));
4777   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4778 
4779   for (k = 0; k < merge->nrecv; k++) {
4780     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4781     nrows       = *buf_ri_k[k];
4782     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4783     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4784   }
4785 
4786   /* set values of ba */
4787   m = merge->rowmap->n;
4788   for (i = 0; i < m; i++) {
4789     arow = owners[rank] + i;
4790     bj_i = bj + bi[i]; /* col indices of the i-th row of mpimat */
4791     bnzi = bi[i + 1] - bi[i];
4792     PetscCall(PetscArrayzero(ba_i, bnzi));
4793 
4794     /* add local non-zero vals of this proc's seqmat into ba */
4795     anzi   = ai[arow + 1] - ai[arow];
4796     aj     = a->j + ai[arow];
4797     aa     = a_a + ai[arow];
4798     nextaj = 0;
4799     for (j = 0; nextaj < anzi; j++) {
4800       if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4801         ba_i[j] += aa[nextaj++];
4802       }
4803     }
4804 
4805     /* add received vals into ba */
4806     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
4807       /* i-th row */
4808       if (i == *nextrow[k]) {
4809         anzi   = *(nextai[k] + 1) - *nextai[k];
4810         aj     = buf_rj[k] + *nextai[k];
4811         aa     = abuf_r[k] + *nextai[k];
4812         nextaj = 0;
4813         for (j = 0; nextaj < anzi; j++) {
4814           if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4815             ba_i[j] += aa[nextaj++];
4816           }
4817         }
4818         nextrow[k]++;
4819         nextai[k]++;
4820       }
4821     }
4822     PetscCall(MatSetValues(mpimat, 1, &arow, bnzi, bj_i, ba_i, INSERT_VALUES));
4823   }
4824   PetscCall(MatSeqAIJRestoreArrayRead(seqmat, &a_a));
4825   PetscCall(MatAssemblyBegin(mpimat, MAT_FINAL_ASSEMBLY));
4826   PetscCall(MatAssemblyEnd(mpimat, MAT_FINAL_ASSEMBLY));
4827 
4828   PetscCall(PetscFree(abuf_r[0]));
4829   PetscCall(PetscFree(abuf_r));
4830   PetscCall(PetscFree(ba_i));
4831   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
4832   PetscCall(PetscLogEventEnd(MAT_Seqstompinum, seqmat, 0, 0, 0));
4833   PetscFunctionReturn(PETSC_SUCCESS);
4834 }
4835 
4836 PetscErrorCode MatCreateMPIAIJSumSeqAIJSymbolic(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, Mat *mpimat)
4837 {
4838   Mat                  B_mpi;
4839   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
4840   PetscMPIInt          size, rank, tagi, tagj, *len_s, *len_si, *len_ri;
4841   PetscInt           **buf_rj, **buf_ri, **buf_ri_k;
4842   PetscInt             M = seqmat->rmap->n, N = seqmat->cmap->n, i, *owners, *ai = a->i, *aj = a->j;
4843   PetscInt             len, *dnz, *onz, bs, cbs;
4844   PetscInt             k, anzi, *bi, *bj, *lnk, nlnk, arow, bnzi;
4845   PetscInt             nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextai;
4846   MPI_Request         *si_waits, *sj_waits, *ri_waits, *rj_waits;
4847   MPI_Status          *status;
4848   PetscFreeSpaceList   free_space = NULL, current_space = NULL;
4849   PetscBT              lnkbt;
4850   Mat_Merge_SeqsToMPI *merge;
4851   PetscContainer       container;
4852 
4853   PetscFunctionBegin;
4854   PetscCall(PetscLogEventBegin(MAT_Seqstompisym, seqmat, 0, 0, 0));
4855 
4856   /* make sure it is a PETSc comm */
4857   PetscCall(PetscCommDuplicate(comm, &comm, NULL));
4858   PetscCallMPI(MPI_Comm_size(comm, &size));
4859   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4860 
4861   PetscCall(PetscNew(&merge));
4862   PetscCall(PetscMalloc1(size, &status));
4863 
4864   /* determine row ownership */
4865   PetscCall(PetscLayoutCreate(comm, &merge->rowmap));
4866   PetscCall(PetscLayoutSetLocalSize(merge->rowmap, m));
4867   PetscCall(PetscLayoutSetSize(merge->rowmap, M));
4868   PetscCall(PetscLayoutSetBlockSize(merge->rowmap, 1));
4869   PetscCall(PetscLayoutSetUp(merge->rowmap));
4870   PetscCall(PetscMalloc1(size, &len_si));
4871   PetscCall(PetscMalloc1(size, &merge->len_s));
4872 
4873   m      = merge->rowmap->n;
4874   owners = merge->rowmap->range;
4875 
4876   /* determine the number of messages to send, their lengths */
4877   len_s = merge->len_s;
4878 
4879   len          = 0; /* length of buf_si[] */
4880   merge->nsend = 0;
4881   for (PetscMPIInt proc = 0; proc < size; proc++) {
4882     len_si[proc] = 0;
4883     if (proc == rank) {
4884       len_s[proc] = 0;
4885     } else {
4886       PetscCall(PetscMPIIntCast(owners[proc + 1] - owners[proc] + 1, &len_si[proc]));
4887       PetscCall(PetscMPIIntCast(ai[owners[proc + 1]] - ai[owners[proc]], &len_s[proc])); /* num of rows to be sent to [proc] */
4888     }
4889     if (len_s[proc]) {
4890       merge->nsend++;
4891       nrows = 0;
4892       for (i = owners[proc]; i < owners[proc + 1]; i++) {
4893         if (ai[i + 1] > ai[i]) nrows++;
4894       }
4895       PetscCall(PetscMPIIntCast(2 * (nrows + 1), &len_si[proc]));
4896       len += len_si[proc];
4897     }
4898   }
4899 
4900   /* determine the number and length of messages to receive for ij-structure */
4901   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &merge->nrecv));
4902   PetscCall(PetscGatherMessageLengths2(comm, merge->nsend, merge->nrecv, len_s, len_si, &merge->id_r, &merge->len_r, &len_ri));
4903 
4904   /* post the Irecv of j-structure */
4905   PetscCall(PetscCommGetNewTag(comm, &tagj));
4906   PetscCall(PetscPostIrecvInt(comm, tagj, merge->nrecv, merge->id_r, merge->len_r, &buf_rj, &rj_waits));
4907 
4908   /* post the Isend of j-structure */
4909   PetscCall(PetscMalloc2(merge->nsend, &si_waits, merge->nsend, &sj_waits));
4910 
4911   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4912     if (!len_s[proc]) continue;
4913     i = owners[proc];
4914     PetscCallMPI(MPIU_Isend(aj + ai[i], len_s[proc], MPIU_INT, proc, tagj, comm, sj_waits + k));
4915     k++;
4916   }
4917 
4918   /* receives and sends of j-structure are complete */
4919   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, rj_waits, status));
4920   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, sj_waits, status));
4921 
4922   /* send and recv i-structure */
4923   PetscCall(PetscCommGetNewTag(comm, &tagi));
4924   PetscCall(PetscPostIrecvInt(comm, tagi, merge->nrecv, merge->id_r, len_ri, &buf_ri, &ri_waits));
4925 
4926   PetscCall(PetscMalloc1(len + 1, &buf_s));
4927   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
4928   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4929     if (!len_s[proc]) continue;
4930     /* form outgoing message for i-structure:
4931          buf_si[0]:                 nrows to be sent
4932                [1:nrows]:           row index (global)
4933                [nrows+1:2*nrows+1]: i-structure index
4934     */
4935     nrows       = len_si[proc] / 2 - 1;
4936     buf_si_i    = buf_si + nrows + 1;
4937     buf_si[0]   = nrows;
4938     buf_si_i[0] = 0;
4939     nrows       = 0;
4940     for (i = owners[proc]; i < owners[proc + 1]; i++) {
4941       anzi = ai[i + 1] - ai[i];
4942       if (anzi) {
4943         buf_si_i[nrows + 1] = buf_si_i[nrows] + anzi; /* i-structure */
4944         buf_si[nrows + 1]   = i - owners[proc];       /* local row index */
4945         nrows++;
4946       }
4947     }
4948     PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, si_waits + k));
4949     k++;
4950     buf_si += len_si[proc];
4951   }
4952 
4953   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, ri_waits, status));
4954   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, si_waits, status));
4955 
4956   PetscCall(PetscInfo(seqmat, "nsend: %d, nrecv: %d\n", merge->nsend, merge->nrecv));
4957   for (i = 0; i < merge->nrecv; i++) PetscCall(PetscInfo(seqmat, "recv len_ri=%d, len_rj=%d from [%d]\n", len_ri[i], merge->len_r[i], merge->id_r[i]));
4958 
4959   PetscCall(PetscFree(len_si));
4960   PetscCall(PetscFree(len_ri));
4961   PetscCall(PetscFree(rj_waits));
4962   PetscCall(PetscFree2(si_waits, sj_waits));
4963   PetscCall(PetscFree(ri_waits));
4964   PetscCall(PetscFree(buf_s));
4965   PetscCall(PetscFree(status));
4966 
4967   /* compute a local seq matrix in each processor */
4968   /* allocate bi array and free space for accumulating nonzero column info */
4969   PetscCall(PetscMalloc1(m + 1, &bi));
4970   bi[0] = 0;
4971 
4972   /* create and initialize a linked list */
4973   nlnk = N + 1;
4974   PetscCall(PetscLLCreate(N, N, nlnk, lnk, lnkbt));
4975 
4976   /* initial FreeSpace size is 2*(num of local nnz(seqmat)) */
4977   len = ai[owners[rank + 1]] - ai[owners[rank]];
4978   PetscCall(PetscFreeSpaceGet(PetscIntMultTruncate(2, len) + 1, &free_space));
4979 
4980   current_space = free_space;
4981 
4982   /* determine symbolic info for each local row */
4983   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4984 
4985   for (k = 0; k < merge->nrecv; k++) {
4986     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4987     nrows       = *buf_ri_k[k];
4988     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4989     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4990   }
4991 
4992   MatPreallocateBegin(comm, m, n, dnz, onz);
4993   len = 0;
4994   for (i = 0; i < m; i++) {
4995     bnzi = 0;
4996     /* add local non-zero cols of this proc's seqmat into lnk */
4997     arow = owners[rank] + i;
4998     anzi = ai[arow + 1] - ai[arow];
4999     aj   = a->j + ai[arow];
5000     PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
5001     bnzi += nlnk;
5002     /* add received col data into lnk */
5003     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
5004       if (i == *nextrow[k]) {            /* i-th row */
5005         anzi = *(nextai[k] + 1) - *nextai[k];
5006         aj   = buf_rj[k] + *nextai[k];
5007         PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
5008         bnzi += nlnk;
5009         nextrow[k]++;
5010         nextai[k]++;
5011       }
5012     }
5013     if (len < bnzi) len = bnzi; /* =max(bnzi) */
5014 
5015     /* if free space is not available, make more free space */
5016     if (current_space->local_remaining < bnzi) PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(bnzi, current_space->total_array_size), &current_space));
5017     /* copy data into free space, then initialize lnk */
5018     PetscCall(PetscLLClean(N, N, bnzi, lnk, current_space->array, lnkbt));
5019     PetscCall(MatPreallocateSet(i + owners[rank], bnzi, current_space->array, dnz, onz));
5020 
5021     current_space->array += bnzi;
5022     current_space->local_used += bnzi;
5023     current_space->local_remaining -= bnzi;
5024 
5025     bi[i + 1] = bi[i] + bnzi;
5026   }
5027 
5028   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
5029 
5030   PetscCall(PetscMalloc1(bi[m] + 1, &bj));
5031   PetscCall(PetscFreeSpaceContiguous(&free_space, bj));
5032   PetscCall(PetscLLDestroy(lnk, lnkbt));
5033 
5034   /* create symbolic parallel matrix B_mpi */
5035   PetscCall(MatGetBlockSizes(seqmat, &bs, &cbs));
5036   PetscCall(MatCreate(comm, &B_mpi));
5037   if (n == PETSC_DECIDE) {
5038     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, N));
5039   } else {
5040     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
5041   }
5042   PetscCall(MatSetBlockSizes(B_mpi, bs, cbs));
5043   PetscCall(MatSetType(B_mpi, MATMPIAIJ));
5044   PetscCall(MatMPIAIJSetPreallocation(B_mpi, 0, dnz, 0, onz));
5045   MatPreallocateEnd(dnz, onz);
5046   PetscCall(MatSetOption(B_mpi, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
5047 
5048   /* B_mpi is not ready for use - assembly will be done by MatCreateMPIAIJSumSeqAIJNumeric() */
5049   B_mpi->assembled = PETSC_FALSE;
5050   merge->bi        = bi;
5051   merge->bj        = bj;
5052   merge->buf_ri    = buf_ri;
5053   merge->buf_rj    = buf_rj;
5054   merge->coi       = NULL;
5055   merge->coj       = NULL;
5056   merge->owners_co = NULL;
5057 
5058   PetscCall(PetscCommDestroy(&comm));
5059 
5060   /* attach the supporting struct to B_mpi for reuse */
5061   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
5062   PetscCall(PetscContainerSetPointer(container, merge));
5063   PetscCall(PetscContainerSetCtxDestroy(container, MatDestroy_MPIAIJ_SeqsToMPI));
5064   PetscCall(PetscObjectCompose((PetscObject)B_mpi, "MatMergeSeqsToMPI", (PetscObject)container));
5065   PetscCall(PetscContainerDestroy(&container));
5066   *mpimat = B_mpi;
5067 
5068   PetscCall(PetscLogEventEnd(MAT_Seqstompisym, seqmat, 0, 0, 0));
5069   PetscFunctionReturn(PETSC_SUCCESS);
5070 }
5071 
5072 /*@
5073   MatCreateMPIAIJSumSeqAIJ - Creates a `MATMPIAIJ` matrix by adding sequential
5074   matrices from each processor
5075 
5076   Collective
5077 
5078   Input Parameters:
+ comm   - the communicator the parallel matrix will live on
. seqmat - the input sequential matrix
5081 . m      - number of local rows (or `PETSC_DECIDE`)
5082 . n      - number of local columns (or `PETSC_DECIDE`)
5083 - scall  - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5084 
5085   Output Parameter:
5086 . mpimat - the parallel matrix generated
5087 
5088   Level: advanced
5089 
5090   Note:
5091   The dimensions of the sequential matrix in each processor MUST be the same.
  The input seqmat is included in the container "Mat_Merge_SeqsToMPI", and will be
  destroyed when `mpimat` is destroyed. Call `PetscObjectQuery()` to access `seqmat`.
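
  Example Usage:
  A minimal sketch (hypothetical sizes and values); see the Note above for how the lifetime of `seqmat` is tied to `mpimat`.
.vb
  Mat         seqmat, mpimat;
  PetscInt    row = 0, col = 0;
  PetscScalar v   = 1.0;

  /* each MPI process builds a sequential matrix with the same dimensions */
  PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, 4, 4, 1, NULL, &seqmat));
  PetscCall(MatSetValues(seqmat, 1, &row, 1, &col, &v, ADD_VALUES));
  PetscCall(MatAssemblyBegin(seqmat, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(seqmat, MAT_FINAL_ASSEMBLY));

  /* sum the per-process contributions into a parallel MATMPIAIJ matrix */
  PetscCall(MatCreateMPIAIJSumSeqAIJ(PETSC_COMM_WORLD, seqmat, PETSC_DECIDE, PETSC_DECIDE, MAT_INITIAL_MATRIX, &mpimat));

  PetscCall(MatDestroy(&mpimat));
.ve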
5094 
5095 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJ()`
5096 @*/
5097 PetscErrorCode MatCreateMPIAIJSumSeqAIJ(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, MatReuse scall, Mat *mpimat)
5098 {
5099   PetscMPIInt size;
5100 
5101   PetscFunctionBegin;
5102   PetscCallMPI(MPI_Comm_size(comm, &size));
5103   if (size == 1) {
5104     PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5105     if (scall == MAT_INITIAL_MATRIX) {
5106       PetscCall(MatDuplicate(seqmat, MAT_COPY_VALUES, mpimat));
5107     } else {
5108       PetscCall(MatCopy(seqmat, *mpimat, SAME_NONZERO_PATTERN));
5109     }
5110     PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5111     PetscFunctionReturn(PETSC_SUCCESS);
5112   }
5113   PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5114   if (scall == MAT_INITIAL_MATRIX) PetscCall(MatCreateMPIAIJSumSeqAIJSymbolic(comm, seqmat, m, n, mpimat));
5115   PetscCall(MatCreateMPIAIJSumSeqAIJNumeric(seqmat, *mpimat));
5116   PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5117   PetscFunctionReturn(PETSC_SUCCESS);
5118 }
5119 
5120 /*@
5121   MatAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATAIJ` matrix.
5122 
5123   Not Collective
5124 
5125   Input Parameter:
5126 . A - the matrix
5127 
5128   Output Parameter:
5129 . A_loc - the local sequential matrix generated
5130 
5131   Level: developer
5132 
5133   Notes:
  The matrix is created by taking `A`'s local rows and putting them into a sequential matrix
  with `mlocal` rows and `n` columns, where `mlocal` is the local row count obtained with
  `MatGetLocalSize()` and `n` is the global column count obtained with `MatGetSize()`.
5137 
  In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5139 
  For parallel matrices this creates an entirely new matrix. If the matrix is sequential, it merely increases the reference count.
5141 
5142   Destroy the matrix with `MatDestroy()`
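
  Example Usage:
  A minimal sketch, assuming `A` is an already assembled `MATAIJ` matrix:
.vb
  Mat A_loc;

  PetscCall(MatAIJGetLocalMat(A, &A_loc));
  PetscCall(MatView(A_loc, PETSC_VIEWER_STDOUT_SELF));
  PetscCall(MatDestroy(&A_loc));
.ve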
5143 
5144 .seealso: [](ch_matrices), `Mat`, `MatMPIAIJGetLocalMat()`
5145 @*/
5146 PetscErrorCode MatAIJGetLocalMat(Mat A, Mat *A_loc)
5147 {
5148   PetscBool mpi;
5149 
5150   PetscFunctionBegin;
5151   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &mpi));
5152   if (mpi) {
5153     PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, A_loc));
5154   } else {
5155     *A_loc = A;
5156     PetscCall(PetscObjectReference((PetscObject)*A_loc));
5157   }
5158   PetscFunctionReturn(PETSC_SUCCESS);
5159 }
5160 
5161 /*@
5162   MatMPIAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix.
5163 
5164   Not Collective
5165 
5166   Input Parameters:
5167 + A     - the matrix
5168 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5169 
5170   Output Parameter:
5171 . A_loc - the local sequential matrix generated
5172 
5173   Level: developer
5174 
5175   Notes:
  The matrix is created by taking all `A`'s local rows and putting them into a sequential
  matrix with `mlocal` rows and `n` columns. `mlocal` is the local row count obtained with
  `MatGetLocalSize()` and `n` is the global column count obtained with `MatGetSize()`.
5179 
  In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5181 
5182   When `A` is sequential and `MAT_INITIAL_MATRIX` is requested, the matrix returned is the diagonal part of `A` (which contains the entire matrix),
5183   with its reference count increased by one. Hence changing values of `A_loc` changes `A`. If `MAT_REUSE_MATRIX` is requested on a sequential matrix
5184   then `MatCopy`(Adiag,*`A_loc`,`SAME_NONZERO_PATTERN`) is called to fill `A_loc`. Thus one can preallocate the appropriate sequential matrix `A_loc`
5185   and then call this routine with `MAT_REUSE_MATRIX`. In this case, one can modify the values of `A_loc` without affecting the original sequential matrix.
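
  Example Usage:
  A minimal sketch, assuming `A` is an assembled `MATMPIAIJ` matrix whose values (but not nonzero pattern) change between the two calls:
.vb
  Mat A_loc;

  PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &A_loc));
  /* ... the values of A are modified elsewhere, keeping the same nonzero pattern ... */
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &A_loc));
  PetscCall(MatDestroy(&A_loc));
.ve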
5186 
5187 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMatCondensed()`, `MatMPIAIJGetLocalMatMerge()`
5188 @*/
5189 PetscErrorCode MatMPIAIJGetLocalMat(Mat A, MatReuse scall, Mat *A_loc)
5190 {
5191   Mat_MPIAIJ        *mpimat = (Mat_MPIAIJ *)A->data;
5192   Mat_SeqAIJ        *mat, *a, *b;
5193   PetscInt          *ai, *aj, *bi, *bj, *cmap = mpimat->garray;
5194   const PetscScalar *aa, *ba, *aav, *bav;
5195   PetscScalar       *ca, *cam;
5196   PetscMPIInt        size;
5197   PetscInt           am = A->rmap->n, i, j, k, cstart = A->cmap->rstart;
5198   PetscInt          *ci, *cj, col, ncols_d, ncols_o, jo;
5199   PetscBool          match;
5200 
5201   PetscFunctionBegin;
5202   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &match));
5203   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5204   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5205   if (size == 1) {
5206     if (scall == MAT_INITIAL_MATRIX) {
5207       PetscCall(PetscObjectReference((PetscObject)mpimat->A));
5208       *A_loc = mpimat->A;
5209     } else if (scall == MAT_REUSE_MATRIX) {
5210       PetscCall(MatCopy(mpimat->A, *A_loc, SAME_NONZERO_PATTERN));
5211     }
5212     PetscFunctionReturn(PETSC_SUCCESS);
5213   }
5214 
5215   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5216   a  = (Mat_SeqAIJ *)mpimat->A->data;
5217   b  = (Mat_SeqAIJ *)mpimat->B->data;
5218   ai = a->i;
5219   aj = a->j;
5220   bi = b->i;
5221   bj = b->j;
5222   PetscCall(MatSeqAIJGetArrayRead(mpimat->A, &aav));
5223   PetscCall(MatSeqAIJGetArrayRead(mpimat->B, &bav));
5224   aa = aav;
5225   ba = bav;
5226   if (scall == MAT_INITIAL_MATRIX) {
5227     PetscCall(PetscMalloc1(1 + am, &ci));
5228     ci[0] = 0;
5229     for (i = 0; i < am; i++) ci[i + 1] = ci[i] + (ai[i + 1] - ai[i]) + (bi[i + 1] - bi[i]);
5230     PetscCall(PetscMalloc1(1 + ci[am], &cj));
5231     PetscCall(PetscMalloc1(1 + ci[am], &ca));
5232     k = 0;
5233     for (i = 0; i < am; i++) {
5234       ncols_o = bi[i + 1] - bi[i];
5235       ncols_d = ai[i + 1] - ai[i];
      /* off-diagonal portion of A: global columns before the diagonal block */
5237       for (jo = 0; jo < ncols_o; jo++) {
5238         col = cmap[*bj];
5239         if (col >= cstart) break;
5240         cj[k] = col;
5241         bj++;
5242         ca[k++] = *ba++;
5243       }
5244       /* diagonal portion of A */
5245       for (j = 0; j < ncols_d; j++) {
5246         cj[k]   = cstart + *aj++;
5247         ca[k++] = *aa++;
5248       }
      /* off-diagonal portion of A: global columns after the diagonal block */
5250       for (j = jo; j < ncols_o; j++) {
5251         cj[k]   = cmap[*bj++];
5252         ca[k++] = *ba++;
5253       }
5254     }
5255     /* put together the new matrix */
5256     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, A->cmap->N, ci, cj, ca, A_loc));
5257     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5258     /* Since these are PETSc arrays, change flags to free them as necessary. */
5259     mat          = (Mat_SeqAIJ *)(*A_loc)->data;
5260     mat->free_a  = PETSC_TRUE;
5261     mat->free_ij = PETSC_TRUE;
5262     mat->nonew   = 0;
5263   } else if (scall == MAT_REUSE_MATRIX) {
5264     mat = (Mat_SeqAIJ *)(*A_loc)->data;
5265     ci  = mat->i;
5266     cj  = mat->j;
5267     PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &cam));
5268     for (i = 0; i < am; i++) {
      /* off-diagonal portion of A: global columns before the diagonal block */
5270       ncols_o = bi[i + 1] - bi[i];
5271       for (jo = 0; jo < ncols_o; jo++) {
5272         col = cmap[*bj];
5273         if (col >= cstart) break;
5274         *cam++ = *ba++;
5275         bj++;
5276       }
5277       /* diagonal portion of A */
5278       ncols_d = ai[i + 1] - ai[i];
5279       for (j = 0; j < ncols_d; j++) *cam++ = *aa++;
      /* off-diagonal portion of A: global columns after the diagonal block */
5281       for (j = jo; j < ncols_o; j++) {
5282         *cam++ = *ba++;
5283         bj++;
5284       }
5285     }
5286     PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &cam));
5287   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5288   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->A, &aav));
5289   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->B, &bav));
5290   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5291   PetscFunctionReturn(PETSC_SUCCESS);
5292 }
5293 
5294 /*@
  MatMPIAIJGetLocalMatMerge - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix by taking all its local rows and putting them into a sequential matrix with
  `mlocal` rows and `n` columns, where `n` is the sum of the number of columns of the diagonal and off-diagonal parts
5297 
5298   Not Collective
5299 
5300   Input Parameters:
5301 + A     - the matrix
5302 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5303 
5304   Output Parameters:
5305 + glob  - sequential `IS` with global indices associated with the columns of the local sequential matrix generated (can be `NULL`)
5306 - A_loc - the local sequential matrix generated
5307 
5308   Level: developer
5309 
5310   Note:
  This is different from `MatMPIAIJGetLocalMat()` since the first columns in the returned matrix are those associated with the diagonal
  part, then those associated with the off-diagonal part (in its local ordering).
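
  Example Usage:
  A minimal sketch, assuming `A` is an assembled `MATMPIAIJ` matrix; column k of `A_loc` corresponds to the k-th entry of `glob`:
.vb
  Mat A_loc;
  IS  glob;

  PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &A_loc));
  /* ... use A_loc and glob ... */
  PetscCall(ISDestroy(&glob));
  PetscCall(MatDestroy(&A_loc));
.ve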
5313 
5314 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`
5315 @*/
5316 PetscErrorCode MatMPIAIJGetLocalMatMerge(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
5317 {
5318   Mat             Ao, Ad;
5319   const PetscInt *cmap;
5320   PetscMPIInt     size;
5321   PetscErrorCode (*f)(Mat, MatReuse, IS *, Mat *);
5322 
5323   PetscFunctionBegin;
5324   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
5325   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5326   if (size == 1) {
5327     if (scall == MAT_INITIAL_MATRIX) {
5328       PetscCall(PetscObjectReference((PetscObject)Ad));
5329       *A_loc = Ad;
5330     } else if (scall == MAT_REUSE_MATRIX) {
5331       PetscCall(MatCopy(Ad, *A_loc, SAME_NONZERO_PATTERN));
5332     }
5333     if (glob) PetscCall(ISCreateStride(PetscObjectComm((PetscObject)Ad), Ad->cmap->n, Ad->cmap->rstart, 1, glob));
5334     PetscFunctionReturn(PETSC_SUCCESS);
5335   }
5336   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", &f));
5337   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5338   if (f) {
5339     PetscCall((*f)(A, scall, glob, A_loc));
5340   } else {
5341     Mat_SeqAIJ        *a = (Mat_SeqAIJ *)Ad->data;
5342     Mat_SeqAIJ        *b = (Mat_SeqAIJ *)Ao->data;
5343     Mat_SeqAIJ        *c;
5344     PetscInt          *ai = a->i, *aj = a->j;
5345     PetscInt          *bi = b->i, *bj = b->j;
5346     PetscInt          *ci, *cj;
5347     const PetscScalar *aa, *ba;
5348     PetscScalar       *ca;
5349     PetscInt           i, j, am, dn, on;
5350 
5351     PetscCall(MatGetLocalSize(Ad, &am, &dn));
5352     PetscCall(MatGetLocalSize(Ao, NULL, &on));
5353     PetscCall(MatSeqAIJGetArrayRead(Ad, &aa));
5354     PetscCall(MatSeqAIJGetArrayRead(Ao, &ba));
5355     if (scall == MAT_INITIAL_MATRIX) {
5356       PetscInt k;
5357       PetscCall(PetscMalloc1(1 + am, &ci));
5358       PetscCall(PetscMalloc1(ai[am] + bi[am], &cj));
5359       PetscCall(PetscMalloc1(ai[am] + bi[am], &ca));
5360       ci[0] = 0;
5361       for (i = 0, k = 0; i < am; i++) {
5362         const PetscInt ncols_o = bi[i + 1] - bi[i];
5363         const PetscInt ncols_d = ai[i + 1] - ai[i];
5364         ci[i + 1]              = ci[i] + ncols_o + ncols_d;
5365         /* diagonal portion of A */
5366         for (j = 0; j < ncols_d; j++, k++) {
5367           cj[k] = *aj++;
5368           ca[k] = *aa++;
5369         }
5370         /* off-diagonal portion of A */
5371         for (j = 0; j < ncols_o; j++, k++) {
5372           cj[k] = dn + *bj++;
5373           ca[k] = *ba++;
5374         }
5375       }
5376       /* put together the new matrix */
5377       PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, dn + on, ci, cj, ca, A_loc));
5378       /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5379       /* Since these are PETSc arrays, change flags to free them as necessary. */
5380       c          = (Mat_SeqAIJ *)(*A_loc)->data;
5381       c->free_a  = PETSC_TRUE;
5382       c->free_ij = PETSC_TRUE;
5383       c->nonew   = 0;
5384       PetscCall(MatSetType(*A_loc, ((PetscObject)Ad)->type_name));
5385     } else if (scall == MAT_REUSE_MATRIX) {
5386       PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &ca));
5387       for (i = 0; i < am; i++) {
5388         const PetscInt ncols_d = ai[i + 1] - ai[i];
5389         const PetscInt ncols_o = bi[i + 1] - bi[i];
5390         /* diagonal portion of A */
5391         for (j = 0; j < ncols_d; j++) *ca++ = *aa++;
5392         /* off-diagonal portion of A */
5393         for (j = 0; j < ncols_o; j++) *ca++ = *ba++;
5394       }
5395       PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &ca));
5396     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5397     PetscCall(MatSeqAIJRestoreArrayRead(Ad, &aa));
    PetscCall(MatSeqAIJRestoreArrayRead(Ao, &ba));
5399     if (glob) {
5400       PetscInt cst, *gidx;
5401 
5402       PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
5403       PetscCall(PetscMalloc1(dn + on, &gidx));
5404       for (i = 0; i < dn; i++) gidx[i] = cst + i;
5405       for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
5406       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
5407     }
5408   }
5409   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5410   PetscFunctionReturn(PETSC_SUCCESS);
5411 }
5412 
5413 /*@C
  MatMPIAIJGetLocalMatCondensed - Creates a `MATSEQAIJ` matrix from a `MATMPIAIJ` matrix by taking all its local rows and NON-ZERO columns
5415 
5416   Not Collective
5417 
5418   Input Parameters:
5419 + A     - the matrix
5420 . scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5421 . row   - index set of rows to extract (or `NULL`)
5422 - col   - index set of columns to extract (or `NULL`)
5423 
5424   Output Parameter:
5425 . A_loc - the local sequential matrix generated
5426 
5427   Level: developer
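
  Example Usage:
  A minimal sketch, assuming `A` is an assembled `MATMPIAIJ` matrix; passing `NULL` for `row` and `col` selects all local rows and the nonzero columns:
.vb
  Mat A_loc;

  PetscCall(MatMPIAIJGetLocalMatCondensed(A, MAT_INITIAL_MATRIX, NULL, NULL, &A_loc));
  PetscCall(MatDestroy(&A_loc));
.ve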
5428 
5429 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`
5430 @*/
5431 PetscErrorCode MatMPIAIJGetLocalMatCondensed(Mat A, MatReuse scall, IS *row, IS *col, Mat *A_loc)
5432 {
5433   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5434   PetscInt    i, start, end, ncols, nzA, nzB, *cmap, imark, *idx;
5435   IS          isrowa, iscola;
5436   Mat        *aloc;
5437   PetscBool   match;
5438 
5439   PetscFunctionBegin;
5440   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &match));
5441   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5442   PetscCall(PetscLogEventBegin(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5443   if (!row) {
5444     start = A->rmap->rstart;
5445     end   = A->rmap->rend;
5446     PetscCall(ISCreateStride(PETSC_COMM_SELF, end - start, start, 1, &isrowa));
5447   } else {
5448     isrowa = *row;
5449   }
5450   if (!col) {
5451     start = A->cmap->rstart;
5452     cmap  = a->garray;
5453     nzA   = a->A->cmap->n;
5454     nzB   = a->B->cmap->n;
5455     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5456     ncols = 0;
5457     for (i = 0; i < nzB; i++) {
5458       if (cmap[i] < start) idx[ncols++] = cmap[i];
5459       else break;
5460     }
5461     imark = i;
5462     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;
5463     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i];
5464     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &iscola));
5465   } else {
5466     iscola = *col;
5467   }
5468   if (scall != MAT_INITIAL_MATRIX) {
5469     PetscCall(PetscMalloc1(1, &aloc));
5470     aloc[0] = *A_loc;
5471   }
5472   PetscCall(MatCreateSubMatrices(A, 1, &isrowa, &iscola, scall, &aloc));
5473   if (!col) { /* attach global id of condensed columns */
5474     PetscCall(PetscObjectCompose((PetscObject)aloc[0], "_petsc_GetLocalMatCondensed_iscol", (PetscObject)iscola));
5475   }
5476   *A_loc = aloc[0];
5477   PetscCall(PetscFree(aloc));
5478   if (!row) PetscCall(ISDestroy(&isrowa));
5479   if (!col) PetscCall(ISDestroy(&iscola));
5480   PetscCall(PetscLogEventEnd(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5481   PetscFunctionReturn(PETSC_SUCCESS);
5482 }
5483 
5484 /*
 * Create a sequential AIJ matrix based on row indices; all columns of a matched row are extracted at once.
 * Rows could be local or remote. The routine is designed to be scalable in memory, so that nothing is based
 * on a global size.
5488  * */
5489 static PetscErrorCode MatCreateSeqSubMatrixWithRows_Private(Mat P, IS rows, Mat *P_oth)
5490 {
5491   Mat_MPIAIJ            *p  = (Mat_MPIAIJ *)P->data;
5492   Mat_SeqAIJ            *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data, *p_oth;
5493   PetscInt               plocalsize, nrows, *ilocal, *oilocal, i, lidx, *nrcols, *nlcols, ncol;
5494   PetscMPIInt            owner;
5495   PetscSFNode           *iremote, *oiremote;
5496   const PetscInt        *lrowindices;
5497   PetscSF                sf, osf;
5498   PetscInt               pcstart, *roffsets, *loffsets, *pnnz, j;
5499   PetscInt               ontotalcols, dntotalcols, ntotalcols, nout;
5500   MPI_Comm               comm;
5501   ISLocalToGlobalMapping mapping;
5502   const PetscScalar     *pd_a, *po_a;
5503 
5504   PetscFunctionBegin;
5505   PetscCall(PetscObjectGetComm((PetscObject)P, &comm));
5506   /* plocalsize is the number of roots
5507    * nrows is the number of leaves
5508    * */
5509   PetscCall(MatGetLocalSize(P, &plocalsize, NULL));
5510   PetscCall(ISGetLocalSize(rows, &nrows));
5511   PetscCall(PetscCalloc1(nrows, &iremote));
5512   PetscCall(ISGetIndices(rows, &lrowindices));
5513   for (i = 0; i < nrows; i++) {
5514     /* Find a remote index and an owner for a row
5515      * The row could be local or remote
5516      * */
5517     owner = 0;
5518     lidx  = 0;
5519     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, &lidx));
5520     iremote[i].index = lidx;
5521     iremote[i].rank  = owner;
5522   }
5523   /* Create SF to communicate how many nonzero columns for each row */
5524   PetscCall(PetscSFCreate(comm, &sf));
5525   /* SF will figure out the number of nonzero columns for each row, and their
5526    * offsets
5527    * */
5528   PetscCall(PetscSFSetGraph(sf, plocalsize, nrows, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5529   PetscCall(PetscSFSetFromOptions(sf));
5530   PetscCall(PetscSFSetUp(sf));
5531 
5532   PetscCall(PetscCalloc1(2 * (plocalsize + 1), &roffsets));
5533   PetscCall(PetscCalloc1(2 * plocalsize, &nrcols));
5534   PetscCall(PetscCalloc1(nrows, &pnnz));
5535   roffsets[0] = 0;
5536   roffsets[1] = 0;
5537   for (i = 0; i < plocalsize; i++) {
5538     /* diagonal */
5539     nrcols[i * 2 + 0] = pd->i[i + 1] - pd->i[i];
5540     /* off-diagonal */
5541     nrcols[i * 2 + 1] = po->i[i + 1] - po->i[i];
    /* compute prefix-sum offsets so that we know where each row's entries start */
5543     roffsets[(i + 1) * 2 + 0] = roffsets[i * 2 + 0] + nrcols[i * 2 + 0];
5544     roffsets[(i + 1) * 2 + 1] = roffsets[i * 2 + 1] + nrcols[i * 2 + 1];
5545   }
5546   PetscCall(PetscCalloc1(2 * nrows, &nlcols));
5547   PetscCall(PetscCalloc1(2 * nrows, &loffsets));
5548   /* 'r' means root, and 'l' means leaf */
5549   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5550   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5551   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5552   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5553   PetscCall(PetscSFDestroy(&sf));
5554   PetscCall(PetscFree(roffsets));
5555   PetscCall(PetscFree(nrcols));
5556   dntotalcols = 0;
5557   ontotalcols = 0;
5558   ncol        = 0;
5559   for (i = 0; i < nrows; i++) {
5560     pnnz[i] = nlcols[i * 2 + 0] + nlcols[i * 2 + 1];
5561     ncol    = PetscMax(pnnz[i], ncol);
5562     /* diagonal */
5563     dntotalcols += nlcols[i * 2 + 0];
5564     /* off-diagonal */
5565     ontotalcols += nlcols[i * 2 + 1];
5566   }
  /* We do not need to figure out the right number of columns
5568    * since all the calculations will be done by going through the raw data
5569    * */
5570   PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, nrows, ncol, 0, pnnz, P_oth));
5571   PetscCall(MatSetUp(*P_oth));
5572   PetscCall(PetscFree(pnnz));
5573   p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5574   /* diagonal */
5575   PetscCall(PetscCalloc1(dntotalcols, &iremote));
5576   /* off-diagonal */
5577   PetscCall(PetscCalloc1(ontotalcols, &oiremote));
5578   /* diagonal */
5579   PetscCall(PetscCalloc1(dntotalcols, &ilocal));
5580   /* off-diagonal */
5581   PetscCall(PetscCalloc1(ontotalcols, &oilocal));
5582   dntotalcols = 0;
5583   ontotalcols = 0;
5584   ntotalcols  = 0;
5585   for (i = 0; i < nrows; i++) {
5586     owner = 0;
5587     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, NULL));
5588     /* Set iremote for diag matrix */
5589     for (j = 0; j < nlcols[i * 2 + 0]; j++) {
5590       iremote[dntotalcols].index = loffsets[i * 2 + 0] + j;
5591       iremote[dntotalcols].rank  = owner;
      /* P_oth is SeqAIJ, so ilocal needs to point to the first part of memory */
5593       ilocal[dntotalcols++] = ntotalcols++;
5594     }
5595     /* off-diagonal */
5596     for (j = 0; j < nlcols[i * 2 + 1]; j++) {
5597       oiremote[ontotalcols].index = loffsets[i * 2 + 1] + j;
5598       oiremote[ontotalcols].rank  = owner;
5599       oilocal[ontotalcols++]      = ntotalcols++;
5600     }
5601   }
5602   PetscCall(ISRestoreIndices(rows, &lrowindices));
5603   PetscCall(PetscFree(loffsets));
5604   PetscCall(PetscFree(nlcols));
5605   PetscCall(PetscSFCreate(comm, &sf));
5606   /* P serves as roots and P_oth is leaves
5607    * Diag matrix
5608    * */
5609   PetscCall(PetscSFSetGraph(sf, pd->i[plocalsize], dntotalcols, ilocal, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5610   PetscCall(PetscSFSetFromOptions(sf));
5611   PetscCall(PetscSFSetUp(sf));
5612 
5613   PetscCall(PetscSFCreate(comm, &osf));
5614   /* off-diagonal */
5615   PetscCall(PetscSFSetGraph(osf, po->i[plocalsize], ontotalcols, oilocal, PETSC_OWN_POINTER, oiremote, PETSC_OWN_POINTER));
5616   PetscCall(PetscSFSetFromOptions(osf));
5617   PetscCall(PetscSFSetUp(osf));
5618   PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5619   PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5620   /* operate on the matrix internal data to save memory */
5621   PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5622   PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5623   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, NULL));
5624   /* Convert to global indices for diag matrix */
5625   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] += pcstart;
5626   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
  /* We want P_oth to store global indices */
5628   PetscCall(ISLocalToGlobalMappingCreate(comm, 1, p->B->cmap->n, p->garray, PETSC_COPY_VALUES, &mapping));
5629   /* Use memory scalable approach */
5630   PetscCall(ISLocalToGlobalMappingSetType(mapping, ISLOCALTOGLOBALMAPPINGHASH));
5631   PetscCall(ISLocalToGlobalMappingApply(mapping, po->i[plocalsize], po->j, po->j));
5632   PetscCall(PetscSFBcastBegin(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5633   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
5634   /* Convert back to local indices */
5635   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] -= pcstart;
5636   PetscCall(PetscSFBcastEnd(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5637   nout = 0;
5638   PetscCall(ISGlobalToLocalMappingApply(mapping, IS_GTOLM_DROP, po->i[plocalsize], po->j, &nout, po->j));
5639   PetscCheck(nout == po->i[plocalsize], comm, PETSC_ERR_ARG_INCOMP, "n %" PetscInt_FMT " does not equal to nout %" PetscInt_FMT " ", po->i[plocalsize], nout);
5640   PetscCall(ISLocalToGlobalMappingDestroy(&mapping));
5641   /* Exchange values */
5642   PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5643   PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5644   PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5645   PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5646   /* Stop PETSc from shrinking memory */
5647   for (i = 0; i < nrows; i++) p_oth->ilen[i] = p_oth->imax[i];
5648   PetscCall(MatAssemblyBegin(*P_oth, MAT_FINAL_ASSEMBLY));
5649   PetscCall(MatAssemblyEnd(*P_oth, MAT_FINAL_ASSEMBLY));
5650   /* Attach PetscSF objects to P_oth so that we can reuse it later */
5651   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "diagsf", (PetscObject)sf));
5652   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "offdiagsf", (PetscObject)osf));
5653   PetscCall(PetscSFDestroy(&sf));
5654   PetscCall(PetscSFDestroy(&osf));
5655   PetscFunctionReturn(PETSC_SUCCESS);
5656 }
5657 
5658 /*
 * Creates a SeqAIJ matrix by taking the rows of B that correspond to the nonzero columns of local A.
 * This supports MPIAIJ and MAIJ
5661  * */
5662 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat A, Mat P, PetscInt dof, MatReuse reuse, Mat *P_oth)
5663 {
5664   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
5665   Mat_SeqAIJ *p_oth;
5666   IS          rows, map;
5667   PetscHMapI  hamp;
5668   PetscInt    i, htsize, *rowindices, off, *mapping, key, count;
5669   MPI_Comm    comm;
5670   PetscSF     sf, osf;
5671   PetscBool   has;
5672 
5673   PetscFunctionBegin;
5674   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5675   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, P, 0, 0));
5676   /* If it is the first time, create an index set of off-diag nonzero columns of A,
5677    *  and then create a submatrix (that often is an overlapping matrix)
5678    * */
5679   if (reuse == MAT_INITIAL_MATRIX) {
5680     /* Use a hash table to figure out unique keys */
5681     PetscCall(PetscHMapICreateWithSize(a->B->cmap->n, &hamp));
5682     PetscCall(PetscCalloc1(a->B->cmap->n, &mapping));
5683     count = 0;
    /* Assume that a->garray is sorted, otherwise the following does not make sense */
5685     for (i = 0; i < a->B->cmap->n; i++) {
5686       key = a->garray[i] / dof;
5687       PetscCall(PetscHMapIHas(hamp, key, &has));
5688       if (!has) {
5689         mapping[i] = count;
5690         PetscCall(PetscHMapISet(hamp, key, count++));
5691       } else {
        /* Current 'i' has the same key as the previous step */
5693         mapping[i] = count - 1;
5694       }
5695     }
5696     PetscCall(ISCreateGeneral(comm, a->B->cmap->n, mapping, PETSC_OWN_POINTER, &map));
5697     PetscCall(PetscHMapIGetSize(hamp, &htsize));
5698     PetscCheck(htsize == count, comm, PETSC_ERR_ARG_INCOMP, " Size of hash map %" PetscInt_FMT " is inconsistent with count %" PetscInt_FMT, htsize, count);
5699     PetscCall(PetscCalloc1(htsize, &rowindices));
5700     off = 0;
5701     PetscCall(PetscHMapIGetKeys(hamp, &off, rowindices));
5702     PetscCall(PetscHMapIDestroy(&hamp));
5703     PetscCall(PetscSortInt(htsize, rowindices));
5704     PetscCall(ISCreateGeneral(comm, htsize, rowindices, PETSC_OWN_POINTER, &rows));
    /* In case the matrix was already created but the user wants to recreate it */
5706     PetscCall(MatDestroy(P_oth));
5707     PetscCall(MatCreateSeqSubMatrixWithRows_Private(P, rows, P_oth));
5708     PetscCall(PetscObjectCompose((PetscObject)*P_oth, "aoffdiagtopothmapping", (PetscObject)map));
5709     PetscCall(ISDestroy(&map));
5710     PetscCall(ISDestroy(&rows));
5711   } else if (reuse == MAT_REUSE_MATRIX) {
5712     /* If matrix was already created, we simply update values using SF objects
     * that were attached to the matrix earlier.
5714      */
5715     const PetscScalar *pd_a, *po_a;
5716 
5717     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "diagsf", (PetscObject *)&sf));
5718     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "offdiagsf", (PetscObject *)&osf));
5719     PetscCheck(sf && osf, comm, PETSC_ERR_ARG_NULL, "Matrix is not initialized yet");
5720     p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5721     /* Update values in place */
5722     PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5723     PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5724     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5725     PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5726     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5727     PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5728     PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5729     PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5730   } else SETERRQ(comm, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unknown reuse type");
5731   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, P, 0, 0));
5732   PetscFunctionReturn(PETSC_SUCCESS);
5733 }
5734 
5735 /*@C
  MatGetBrowsOfAcols - Returns an `IS` containing the rows of `B` that correspond to the nonzero columns of local `A`
5737 
5738   Collective
5739 
5740   Input Parameters:
5741 + A     - the first matrix in `MATMPIAIJ` format
5742 . B     - the second matrix in `MATMPIAIJ` format
5743 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5744 
5745   Output Parameters:
+ rowb  - on input, the index set of rows of `B` to extract (or `NULL`); modified on output
. colb  - on input, the index set of columns of `B` to extract (or `NULL`); modified on output
5748 - B_seq - the sequential matrix generated
5749 
5750   Level: developer
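
  Example Usage:
  A minimal sketch, assuming `A` and `B` are assembled `MATMPIAIJ` matrices with compatible layouts; the index sets and `B_seq` created on the first call are reused on the second:
.vb
  IS  rowb = NULL, colb = NULL;
  Mat B_seq = NULL;

  PetscCall(MatGetBrowsOfAcols(A, B, MAT_INITIAL_MATRIX, &rowb, &colb, &B_seq));
  /* ... the values of B are modified elsewhere, keeping the same nonzero pattern ... */
  PetscCall(MatGetBrowsOfAcols(A, B, MAT_REUSE_MATRIX, &rowb, &colb, &B_seq));
  PetscCall(ISDestroy(&rowb));
  PetscCall(ISDestroy(&colb));
  PetscCall(MatDestroy(&B_seq));
.ve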
5751 
5752 .seealso: `Mat`, `MATMPIAIJ`, `IS`, `MatReuse`
5753 @*/
5754 PetscErrorCode MatGetBrowsOfAcols(Mat A, Mat B, MatReuse scall, IS *rowb, IS *colb, Mat *B_seq)
5755 {
5756   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5757   PetscInt   *idx, i, start, ncols, nzA, nzB, *cmap, imark;
5758   IS          isrowb, iscolb;
5759   Mat        *bseq = NULL;
5760 
5761   PetscFunctionBegin;
5762   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5763              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5764   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAcols, A, B, 0, 0));
5765 
5766   if (scall == MAT_INITIAL_MATRIX) {
5767     start = A->cmap->rstart;
5768     cmap  = a->garray;
5769     nzA   = a->A->cmap->n;
5770     nzB   = a->B->cmap->n;
5771     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5772     ncols = 0;
5773     for (i = 0; i < nzB; i++) { /* row < local row index */
5774       if (cmap[i] < start) idx[ncols++] = cmap[i];
5775       else break;
5776     }
5777     imark = i;
5778     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;   /* local rows */
5779     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i]; /* row > local row index */
5780     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &isrowb));
5781     PetscCall(ISCreateStride(PETSC_COMM_SELF, B->cmap->N, 0, 1, &iscolb));
5782   } else {
5783     PetscCheck(rowb && colb, PETSC_COMM_SELF, PETSC_ERR_SUP, "IS rowb and colb must be provided for MAT_REUSE_MATRIX");
5784     isrowb = *rowb;
5785     iscolb = *colb;
5786     PetscCall(PetscMalloc1(1, &bseq));
5787     bseq[0] = *B_seq;
5788   }
5789   PetscCall(MatCreateSubMatrices(B, 1, &isrowb, &iscolb, scall, &bseq));
5790   *B_seq = bseq[0];
5791   PetscCall(PetscFree(bseq));
5792   if (!rowb) {
5793     PetscCall(ISDestroy(&isrowb));
5794   } else {
5795     *rowb = isrowb;
5796   }
5797   if (!colb) {
5798     PetscCall(ISDestroy(&iscolb));
5799   } else {
5800     *colb = iscolb;
5801   }
5802   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAcols, A, B, 0, 0));
5803   PetscFunctionReturn(PETSC_SUCCESS);
5804 }
5805 
5806 /*
    MatGetBrowsOfAoCols_MPIAIJ - Creates a `MATSEQAIJ` matrix by taking the rows of B that correspond to the nonzero columns
5808     of the OFF-DIAGONAL portion of local A
5809 
5810     Collective
5811 
5812    Input Parameters:
5813 +    A,B - the matrices in `MATMPIAIJ` format
5814 -    scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5815 
   Output Parameters:
5817 +    startsj_s - starting point in B's sending j-arrays, saved for MAT_REUSE (or NULL)
5818 .    startsj_r - starting point in B's receiving j-arrays, saved for MAT_REUSE (or NULL)
5819 .    bufa_ptr - array for sending matrix values, saved for MAT_REUSE (or NULL)
5820 -    B_oth - the sequential matrix generated with size aBn=a->B->cmap->n by B->cmap->N
5821 
5822     Developer Note:
5823     This directly accesses information inside the VecScatter associated with the matrix-vector product
5824      for this matrix. This is not desirable..
5825 
5826     Level: developer
5827 
5828 */
5829 
5830 PetscErrorCode MatGetBrowsOfAoCols_MPIAIJ(Mat A, Mat B, MatReuse scall, PetscInt **startsj_s, PetscInt **startsj_r, MatScalar **bufa_ptr, Mat *B_oth)
5831 {
5832   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
5833   VecScatter         ctx;
5834   MPI_Comm           comm;
5835   const PetscMPIInt *rprocs, *sprocs;
5836   PetscMPIInt        nrecvs, nsends;
5837   const PetscInt    *srow, *rstarts, *sstarts;
5838   PetscInt          *rowlen, *bufj, *bufJ, ncols = 0, aBn = a->B->cmap->n, row, *b_othi, *b_othj, *rvalues = NULL, *svalues = NULL, *cols, sbs, rbs;
5839   PetscInt           i, j, k = 0, l, ll, nrows, *rstartsj = NULL, *sstartsj, len;
5840   PetscScalar       *b_otha, *bufa, *bufA, *vals = NULL;
5841   MPI_Request       *reqs = NULL, *rwaits = NULL, *swaits = NULL;
5842   PetscMPIInt        size, tag, rank, nreqs;
5843 
5844   PetscFunctionBegin;
5845   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5846   PetscCallMPI(MPI_Comm_size(comm, &size));
5847 
5848   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5849              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5850   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, B, 0, 0));
5851   PetscCallMPI(MPI_Comm_rank(comm, &rank));
5852 
5853   if (size == 1) {
5854     startsj_s = NULL;
5855     bufa_ptr  = NULL;
5856     *B_oth    = NULL;
5857     PetscFunctionReturn(PETSC_SUCCESS);
5858   }
5859 
5860   ctx = a->Mvctx;
5861   tag = ((PetscObject)ctx)->tag;
5862 
5863   PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &srow, &sprocs, &sbs));
5864   /* rprocs[] must be ordered so that indices received from them are ordered in rvalues[], which is key to algorithms used in this subroutine */
5865   PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL /*indices not needed*/, &rprocs, &rbs));
5866   PetscCall(PetscMPIIntCast(nsends + nrecvs, &nreqs));
5867   PetscCall(PetscMalloc1(nreqs, &reqs));
5868   rwaits = reqs;
5869   swaits = PetscSafePointerPlusOffset(reqs, nrecvs);
5870 
5871   if (!startsj_s || !bufa_ptr) scall = MAT_INITIAL_MATRIX;
5872   if (scall == MAT_INITIAL_MATRIX) {
5873     /* i-array */
5874     /*  post receives */
5875     if (nrecvs) PetscCall(PetscMalloc1(rbs * (rstarts[nrecvs] - rstarts[0]), &rvalues)); /* rstarts can be NULL when nrecvs=0 */
5876     for (i = 0; i < nrecvs; i++) {
5877       rowlen = rvalues + rstarts[i] * rbs;
5878       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of indices to be received */
5879       PetscCallMPI(MPIU_Irecv(rowlen, nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5880     }
5881 
5882     /* pack the outgoing message */
5883     PetscCall(PetscMalloc2(nsends + 1, &sstartsj, nrecvs + 1, &rstartsj));
5884 
5885     sstartsj[0] = 0;
5886     rstartsj[0] = 0;
5887     len         = 0; /* total length of j or a array to be sent */
5888     if (nsends) {
5889       k = sstarts[0]; /* ATTENTION: sstarts[0] and rstarts[0] are not necessarily zero */
5890       PetscCall(PetscMalloc1(sbs * (sstarts[nsends] - sstarts[0]), &svalues));
5891     }
5892     for (i = 0; i < nsends; i++) {
5893       rowlen = svalues + (sstarts[i] - sstarts[0]) * sbs;
5894       nrows  = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5895       for (j = 0; j < nrows; j++) {
5896         row = srow[k] + B->rmap->range[rank]; /* global row idx */
5897         for (l = 0; l < sbs; l++) {
5898           PetscCall(MatGetRow_MPIAIJ(B, row + l, &ncols, NULL, NULL)); /* rowlength */
5899 
5900           rowlen[j * sbs + l] = ncols;
5901 
5902           len += ncols;
5903           PetscCall(MatRestoreRow_MPIAIJ(B, row + l, &ncols, NULL, NULL));
5904         }
5905         k++;
5906       }
5907       PetscCallMPI(MPIU_Isend(rowlen, nrows * sbs, MPIU_INT, sprocs[i], tag, comm, swaits + i));
5908 
5909       sstartsj[i + 1] = len; /* starting point of (i+1)-th outgoing msg in bufj and bufa */
5910     }
5911     /* recvs and sends of i-array are completed */
5912     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5913     PetscCall(PetscFree(svalues));
5914 
5915     /* allocate buffers for sending j and a arrays */
5916     PetscCall(PetscMalloc1(len + 1, &bufj));
5917     PetscCall(PetscMalloc1(len + 1, &bufa));
5918 
5919     /* create i-array of B_oth */
5920     PetscCall(PetscMalloc1(aBn + 2, &b_othi));
5921 
5922     b_othi[0] = 0;
5923     len       = 0; /* total length of j or a array to be received */
5924     k         = 0;
5925     for (i = 0; i < nrecvs; i++) {
5926       rowlen = rvalues + (rstarts[i] - rstarts[0]) * rbs;
5927       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of rows to be received */
5928       for (j = 0; j < nrows; j++) {
5929         b_othi[k + 1] = b_othi[k] + rowlen[j];
5930         PetscCall(PetscIntSumError(rowlen[j], len, &len));
5931         k++;
5932       }
5933       rstartsj[i + 1] = len; /* starting point of (i+1)-th incoming msg in bufj and bufa */
5934     }
5935     PetscCall(PetscFree(rvalues));
5936 
5937     /* allocate space for j and a arrays of B_oth */
5938     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_othj));
5939     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_otha));
5940 
5941     /* j-array */
5942     /*  post receives of j-array */
5943     for (i = 0; i < nrecvs; i++) {
5944       nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5945       PetscCallMPI(MPIU_Irecv(b_othj + rstartsj[i], nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5946     }
5947 
5948     /* pack the outgoing message j-array */
5949     if (nsends) k = sstarts[0];
5950     for (i = 0; i < nsends; i++) {
5951       nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5952       bufJ  = bufj + sstartsj[i];
5953       for (j = 0; j < nrows; j++) {
5954         row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5955         for (ll = 0; ll < sbs; ll++) {
5956           PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5957           for (l = 0; l < ncols; l++) *bufJ++ = cols[l];
5958           PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5959         }
5960       }
5961       PetscCallMPI(MPIU_Isend(bufj + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_INT, sprocs[i], tag, comm, swaits + i));
5962     }
5963 
5964     /* recvs and sends of j-array are completed */
5965     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5966   } else if (scall == MAT_REUSE_MATRIX) {
5967     sstartsj = *startsj_s;
5968     rstartsj = *startsj_r;
5969     bufa     = *bufa_ptr;
5970     PetscCall(MatSeqAIJGetArrayWrite(*B_oth, &b_otha));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5972 
5973   /* a-array */
5974   /*  post receives of a-array */
5975   for (i = 0; i < nrecvs; i++) {
5976     nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5977     PetscCallMPI(MPIU_Irecv(b_otha + rstartsj[i], nrows, MPIU_SCALAR, rprocs[i], tag, comm, rwaits + i));
5978   }
5979 
5980   /* pack the outgoing message a-array */
5981   if (nsends) k = sstarts[0];
5982   for (i = 0; i < nsends; i++) {
5983     nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5984     bufA  = bufa + sstartsj[i];
5985     for (j = 0; j < nrows; j++) {
5986       row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5987       for (ll = 0; ll < sbs; ll++) {
5988         PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5989         for (l = 0; l < ncols; l++) *bufA++ = vals[l];
5990         PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5991       }
5992     }
5993     PetscCallMPI(MPIU_Isend(bufa + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_SCALAR, sprocs[i], tag, comm, swaits + i));
5994   }
5995   /* recvs and sends of a-array are completed */
5996   if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5997   PetscCall(PetscFree(reqs));
5998 
5999   if (scall == MAT_INITIAL_MATRIX) {
6000     Mat_SeqAIJ *b_oth;
6001 
6002     /* put together the new matrix */
6003     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, aBn, B->cmap->N, b_othi, b_othj, b_otha, B_oth));
6004 
6005     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6006     /* Since these are PETSc arrays, change flags to free them as necessary. */
6007     b_oth          = (Mat_SeqAIJ *)(*B_oth)->data;
6008     b_oth->free_a  = PETSC_TRUE;
6009     b_oth->free_ij = PETSC_TRUE;
6010     b_oth->nonew   = 0;
6011 
6012     PetscCall(PetscFree(bufj));
6013     if (!startsj_s || !bufa_ptr) {
6014       PetscCall(PetscFree2(sstartsj, rstartsj));
6015       PetscCall(PetscFree(bufa_ptr));
6016     } else {
6017       *startsj_s = sstartsj;
6018       *startsj_r = rstartsj;
6019       *bufa_ptr  = bufa;
6020     }
6021   } else if (scall == MAT_REUSE_MATRIX) {
6022     PetscCall(MatSeqAIJRestoreArrayWrite(*B_oth, &b_otha));
6023   }
6024 
6025   PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE, &nsends, &sstarts, &srow, &sprocs, &sbs));
6026   PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE, &nrecvs, &rstarts, NULL, &rprocs, &rbs));
6027   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, B, 0, 0));
6028   PetscFunctionReturn(PETSC_SUCCESS);
6029 }
6030 
6031 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCRL(Mat, MatType, MatReuse, Mat *);
6032 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJPERM(Mat, MatType, MatReuse, Mat *);
6033 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJSELL(Mat, MatType, MatReuse, Mat *);
6034 #if defined(PETSC_HAVE_MKL_SPARSE)
6035 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJMKL(Mat, MatType, MatReuse, Mat *);
6036 #endif
6037 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIBAIJ(Mat, MatType, MatReuse, Mat *);
6038 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISBAIJ(Mat, MatType, MatReuse, Mat *);
6039 #if defined(PETSC_HAVE_ELEMENTAL)
6040 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_Elemental(Mat, MatType, MatReuse, Mat *);
6041 #endif
6042 #if defined(PETSC_HAVE_SCALAPACK)
6043 PETSC_INTERN PetscErrorCode MatConvert_AIJ_ScaLAPACK(Mat, MatType, MatReuse, Mat *);
6044 #endif
6045 #if defined(PETSC_HAVE_HYPRE)
6046 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
6047 #endif
6048 #if defined(PETSC_HAVE_CUDA)
6049 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
6050 #endif
6051 #if defined(PETSC_HAVE_HIP)
6052 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
6053 #endif
6054 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6055 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat, MatType, MatReuse, Mat *);
6056 #endif
6057 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISELL(Mat, MatType, MatReuse, Mat *);
6058 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
6059 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_IS_XAIJ(Mat);
6060 
6061 /*
    Computes C = (B'*A')', which equals A*B, since computing A*B directly is untenable
6063 
6064                n                       p                          p
6065         [             ]       [             ]         [                 ]
6066       m [      A      ]  *  n [       B     ]   =   m [         C       ]
6067         [             ]       [             ]         [                 ]
6068 
6069 */
6070 static PetscErrorCode MatMatMultNumeric_MPIDense_MPIAIJ(Mat A, Mat B, Mat C)
6071 {
6072   Mat At, Bt, Ct;
6073 
6074   PetscFunctionBegin;
6075   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
6076   PetscCall(MatTranspose(B, MAT_INITIAL_MATRIX, &Bt));
6077   PetscCall(MatMatMult(Bt, At, MAT_INITIAL_MATRIX, PETSC_CURRENT, &Ct));
6078   PetscCall(MatDestroy(&At));
6079   PetscCall(MatDestroy(&Bt));
6080   PetscCall(MatTransposeSetPrecursor(Ct, C));
6081   PetscCall(MatTranspose(Ct, MAT_REUSE_MATRIX, &C));
6082   PetscCall(MatDestroy(&Ct));
6083   PetscFunctionReturn(PETSC_SUCCESS);
6084 }
6085 
6086 static PetscErrorCode MatMatMultSymbolic_MPIDense_MPIAIJ(Mat A, Mat B, PetscReal fill, Mat C)
6087 {
6088   PetscBool cisdense;
6089 
6090   PetscFunctionBegin;
6091   PetscCheck(A->cmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "A->cmap->n %" PetscInt_FMT " != B->rmap->n %" PetscInt_FMT, A->cmap->n, B->rmap->n);
6092   PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
6093   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6094   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSECUDA, MATMPIDENSEHIP, ""));
6095   if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
6096   PetscCall(MatSetUp(C));
6097 
6098   C->ops->matmultnumeric = MatMatMultNumeric_MPIDense_MPIAIJ;
6099   PetscFunctionReturn(PETSC_SUCCESS);
6100 }
6101 
6102 static PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ_AB(Mat C)
6103 {
6104   Mat_Product *product = C->product;
6105   Mat          A = product->A, B = product->B;
6106 
6107   PetscFunctionBegin;
6108   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
6109              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
6110   C->ops->matmultsymbolic = MatMatMultSymbolic_MPIDense_MPIAIJ;
6111   C->ops->productsymbolic = MatProductSymbolic_AB;
6112   PetscFunctionReturn(PETSC_SUCCESS);
6113 }
6114 
6115 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ(Mat C)
6116 {
6117   Mat_Product *product = C->product;
6118 
6119   PetscFunctionBegin;
6120   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_MPIDense_MPIAIJ_AB(C));
6121   PetscFunctionReturn(PETSC_SUCCESS);
6122 }
6123 
6124 /*
6125    Merge two sets of sorted nonzeros and return a CSR for the merged (sequential) matrix
6126 
6127   Input Parameters:
6128 
6129     j1,rowBegin1,rowEnd1,jmap1: describe the first set of nonzeros (Set1)
6130     j2,rowBegin2,rowEnd2,jmap2: describe the second set of nonzeros (Set2)
6131 
6132     mat: both sets' nonzeros are on m rows, where m is the number of local rows of the matrix mat
6133 
6134     For Set1, j1[] contains column indices of the nonzeros.
6135     For the k-th row (0<=k<m), [rowBegin1[k],rowEnd1[k]) index into j1[] and point to the begin/end nonzero in row k
6136     respectively (note rowEnd1[k] is not necessarily equal to rwoBegin1[k+1]). Indices in this range of j1[] are sorted,
6137     but might have repeats. jmap1[t+1] - jmap1[t] is the number of repeats for the t-th unique nonzero in Set1.
6138 
6139     Similar for Set2.
6140 
6141     This routine merges the two sets of nonzeros row by row and removes repeats.
6142 
6143   Output Parameters: (memory is allocated by the caller)
6144 
6145     i[],j[]: the CSR of the merged matrix, which has m rows.
6146     imap1[]: the k-th unique nonzero in Set1 (k=0,1,...) corresponds to imap1[k]-th unique nonzero in the merged matrix.
6147     imap2[]: similar to imap1[], but for Set2.
6148     Note we order nonzeros row-by-row and from left to right.
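
    A small worked example (hypothetical data, m = 1 row):
      Set1: j1[] = {2,2,5}, rowBegin1[] = {0}, rowEnd1[] = {3}, jmap1[] = {0,2,3}  (unique columns 2 and 5)
      Set2: j2[] = {3,5,5}, rowBegin2[] = {0}, rowEnd2[] = {3}, jmap2[] = {0,1,3}  (unique columns 3 and 5)
      Merged: i[] = {0,3}, j[] = {2,3,5}, imap1[] = {0,2}, imap2[] = {1,2}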
6149 */
6150 static PetscErrorCode MatMergeEntries_Internal(Mat mat, const PetscInt j1[], const PetscInt j2[], const PetscCount rowBegin1[], const PetscCount rowEnd1[], const PetscCount rowBegin2[], const PetscCount rowEnd2[], const PetscCount jmap1[], const PetscCount jmap2[], PetscCount imap1[], PetscCount imap2[], PetscInt i[], PetscInt j[])
6151 {
6152   PetscInt   r, m; /* Row index of mat */
6153   PetscCount t, t1, t2, b1, e1, b2, e2;
6154 
6155   PetscFunctionBegin;
6156   PetscCall(MatGetLocalSize(mat, &m, NULL));
  t1 = t2 = t = 0; /* Counts of unique nonzeros in Set1, Set2 and the merged set, respectively */
6158   i[0]        = 0;
6159   for (r = 0; r < m; r++) { /* Do row by row merging */
6160     b1 = rowBegin1[r];
6161     e1 = rowEnd1[r];
6162     b2 = rowBegin2[r];
6163     e2 = rowEnd2[r];
6164     while (b1 < e1 && b2 < e2) {
6165       if (j1[b1] == j2[b2]) { /* Same column index and hence same nonzero */
6166         j[t]      = j1[b1];
6167         imap1[t1] = t;
6168         imap2[t2] = t;
        b1 += jmap1[t1 + 1] - jmap1[t1]; /* Jump to the next unique nonzero in Set1 */
        b2 += jmap2[t2 + 1] - jmap2[t2]; /* Jump to the next unique nonzero in Set2 */
6171         t1++;
6172         t2++;
6173         t++;
6174       } else if (j1[b1] < j2[b2]) {
6175         j[t]      = j1[b1];
6176         imap1[t1] = t;
6177         b1 += jmap1[t1 + 1] - jmap1[t1];
6178         t1++;
6179         t++;
6180       } else {
6181         j[t]      = j2[b2];
6182         imap2[t2] = t;
6183         b2 += jmap2[t2 + 1] - jmap2[t2];
6184         t2++;
6185         t++;
6186       }
6187     }
6188     /* Merge the remaining in either j1[] or j2[] */
6189     while (b1 < e1) {
6190       j[t]      = j1[b1];
6191       imap1[t1] = t;
6192       b1 += jmap1[t1 + 1] - jmap1[t1];
6193       t1++;
6194       t++;
6195     }
6196     while (b2 < e2) {
6197       j[t]      = j2[b2];
6198       imap2[t2] = t;
6199       b2 += jmap2[t2 + 1] - jmap2[t2];
6200       t2++;
6201       t++;
6202     }
6203     PetscCall(PetscIntCast(t, i + r + 1));
6204   }
6205   PetscFunctionReturn(PETSC_SUCCESS);
6206 }
6207 
6208 /*
6209   Split nonzeros in a block of local rows into two subsets: those in the diagonal block and those in the off-diagonal block
6210 
6211   Input Parameters:
6212     mat: an MPI matrix that provides row and column layout information for splitting. Let's say its number of local rows is m.
6213     n,i[],j[],perm[]: there are n input entries, belonging to m rows. Row/col indices of the entries are stored in i[] and j[]
6214       respectively, along with a permutation array perm[]. Length of the i[],j[],perm[] arrays is n.
6215 
6216       i[] is already sorted, but within a row, j[] is not sorted and might have repeats.
6217       i[] might contain negative indices at the beginning, which means the corresponding entries should be ignored in the splitting.
6218 
6219   Output Parameters:
6220     j[],perm[]: the routine needs to sort j[] within each row along with perm[].
6221     rowBegin[],rowMid[],rowEnd[]: of length m, and the memory is preallocated and zeroed by the caller.
6222       They contain indices pointing to j[]. For 0<=r<m, [rowBegin[r],rowMid[r]) point to begin/end entries of row r of the diagonal block,
6223       and [rowMid[r],rowEnd[r]) point to begin/end entries of row r of the off-diagonal block.
6224 
6225     Aperm[],Ajmap[],Atot,Annz: Arrays are allocated by this routine.
6226       Atot: number of entries belonging to the diagonal block.
6227       Annz: number of unique nonzeros belonging to the diagonal block.
6228       Aperm[Atot] stores values from perm[] for entries belonging to the diagonal block. Length of Aperm[] is Atot, though it may also count
6229         repeats (i.e., same 'i,j' pair).
6230       Ajmap[Annz+1] stores the number of repeats of each unique entry belonging to the diagonal block. More precisely, Ajmap[t+1] - Ajmap[t]
        is the number of repeats for the t-th unique entry in the diagonal block. Ajmap[0] is always 0.
6235 
6236     Bperm[], Bjmap[], Btot, Bnnz are similar but for the off-diagonal block.
6237 
6238     Aperm[],Bperm[],Ajmap[] and Bjmap[] are allocated separately by this routine with PetscMalloc1().
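
    A small worked example (hypothetical data): say mat owns rows [0,2) and columns [0,2) of a matrix with 4 global columns, and the
    input is n = 5, i[] = {0,0,0,1,1}, j[] = {3,1,1,0,2} (no negative rows). Then after this routine
      rowBegin[] = {0,3}, rowMid[] = {2,4}, rowEnd[] = {3,5}  (within each row, j[] is sorted with the diagonal-block columns first)
      Atot = 3, Annz = 2, Ajmap[] = {0,2,3}  (diagonal-block nonzeros: column 1 of row 0 twice, column 0 of row 1 once)
      Btot = 2, Bnnz = 2, Bjmap[] = {0,1,2}  (off-diagonal nonzeros: column 3 of row 0, column 2 of row 1)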
6239 */
6240 static PetscErrorCode MatSplitEntries_Internal(Mat mat, PetscCount n, const PetscInt i[], PetscInt j[], PetscCount perm[], PetscCount rowBegin[], PetscCount rowMid[], PetscCount rowEnd[], PetscCount *Atot_, PetscCount **Aperm_, PetscCount *Annz_, PetscCount **Ajmap_, PetscCount *Btot_, PetscCount **Bperm_, PetscCount *Bnnz_, PetscCount **Bjmap_)
6241 {
6242   PetscInt    cstart, cend, rstart, rend, row, col;
6243   PetscCount  Atot = 0, Btot = 0; /* Total number of nonzeros in the diagonal and off-diagonal blocks */
6244   PetscCount  Annz = 0, Bnnz = 0; /* Number of unique nonzeros in the diagonal and off-diagonal blocks */
6245   PetscCount  k, m, p, q, r, s, mid;
6246   PetscCount *Aperm, *Bperm, *Ajmap, *Bjmap;
6247 
6248   PetscFunctionBegin;
6249   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6250   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6251   m = rend - rstart;
6252 
6253   /* Skip negative rows */
6254   for (k = 0; k < n; k++)
6255     if (i[k] >= 0) break;
6256 
6257   /* Process [k,n): sort and partition each local row into diag and offdiag portions,
6258      fill rowBegin[], rowMid[], rowEnd[], and count Atot, Btot, Annz, Bnnz.
6259   */
6260   while (k < n) {
6261     row = i[k];
6262     /* Entries in [k,s) are in one row. Shift diagonal block col indices so that diag is ahead of offdiag after sorting the row */
6263     for (s = k; s < n; s++)
6264       if (i[s] != row) break;
6265 
6266     /* Shift diag columns to range of [-PETSC_INT_MAX, -1] */
6267     for (p = k; p < s; p++) {
6268       if (j[p] >= cstart && j[p] < cend) j[p] -= PETSC_INT_MAX;
6269       else PetscAssert((j[p] >= 0) && (j[p] <= mat->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index %" PetscInt_FMT " is out of range", j[p]);
6270     }
6271     PetscCall(PetscSortIntWithCountArray(s - k, j + k, perm + k));
6272     PetscCall(PetscSortedIntUpperBound(j, k, s, -1, &mid)); /* Separate [k,s) into [k,mid) for diag and [mid,s) for offdiag */
6273     rowBegin[row - rstart] = k;
6274     rowMid[row - rstart]   = mid;
6275     rowEnd[row - rstart]   = s;
6276 
6277     /* Count nonzeros of this diag/offdiag row, which might have repeats */
6278     Atot += mid - k;
6279     Btot += s - mid;
6280 
6281     /* Count unique nonzeros of this diag row */
6282     for (p = k; p < mid;) {
6283       col = j[p];
6284       do {
6285         j[p] += PETSC_INT_MAX; /* Revert the modified diagonal indices */
6286         p++;
6287       } while (p < mid && j[p] == col);
6288       Annz++;
6289     }
6290 
6291     /* Count unique nonzeros of this offdiag row */
6292     for (p = mid; p < s;) {
6293       col = j[p];
6294       do {
6295         p++;
6296       } while (p < s && j[p] == col);
6297       Bnnz++;
6298     }
6299     k = s;
6300   }
6301 
6302   /* Allocation according to Atot, Btot, Annz, Bnnz */
6303   PetscCall(PetscMalloc1(Atot, &Aperm));
6304   PetscCall(PetscMalloc1(Btot, &Bperm));
6305   PetscCall(PetscMalloc1(Annz + 1, &Ajmap));
6306   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap));
6307 
6308   /* Re-scan indices and copy diag/offdiag permutation indices to Aperm, Bperm and also fill Ajmap and Bjmap */
6309   Ajmap[0] = Bjmap[0] = Atot = Btot = Annz = Bnnz = 0;
6310   for (r = 0; r < m; r++) {
6311     k   = rowBegin[r];
6312     mid = rowMid[r];
6313     s   = rowEnd[r];
6314     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Aperm, Atot), PetscSafePointerPlusOffset(perm, k), mid - k));
6315     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Bperm, Btot), PetscSafePointerPlusOffset(perm, mid), s - mid));
6316     Atot += mid - k;
6317     Btot += s - mid;
6318 
6319     /* Scan column indices in this row and find out how many repeats each unique nonzero has */
6320     for (p = k; p < mid;) {
6321       col = j[p];
6322       q   = p;
6323       do {
6324         p++;
6325       } while (p < mid && j[p] == col);
6326       Ajmap[Annz + 1] = Ajmap[Annz] + (p - q);
6327       Annz++;
6328     }
6329 
6330     for (p = mid; p < s;) {
6331       col = j[p];
6332       q   = p;
6333       do {
6334         p++;
6335       } while (p < s && j[p] == col);
6336       Bjmap[Bnnz + 1] = Bjmap[Bnnz] + (p - q);
6337       Bnnz++;
6338     }
6339   }
6340   /* Output */
6341   *Aperm_ = Aperm;
6342   *Annz_  = Annz;
6343   *Atot_  = Atot;
6344   *Ajmap_ = Ajmap;
6345   *Bperm_ = Bperm;
6346   *Bnnz_  = Bnnz;
6347   *Btot_  = Btot;
6348   *Bjmap_ = Bjmap;
6349   PetscFunctionReturn(PETSC_SUCCESS);
6350 }
6351 
6352 /*
6353   Expand the jmap[] array to make a new one in view of nonzeros in the merged matrix
6354 
6355   Input Parameters:
6356     nnz1: number of unique nonzeros in a set that was used to produce imap[], jmap[]
6357     nnz:  number of unique nonzeros in the merged matrix
6358     imap[nnz1]: i-th nonzero in the set is the imap[i]-th nonzero in the merged matrix
6359     jmap[nnz1+1]: i-th nonzero in the set has jmap[i+1] - jmap[i] repeats in the set
6360 
6361   Output Parameter: (memory is allocated by the caller)
6362     jmap_new[nnz+1]: i-th nonzero in the merged matrix has jmap_new[i+1] - jmap_new[i] repeats in the set
6363 
6364   Example:
6365     nnz1 = 4
6366     nnz  = 6
6367     imap = [1,3,4,5]
6368     jmap = [0,3,5,6,7]
6369    then,
6370     jmap_new = [0,0,3,3,5,6,7]
6371 */
6372 static PetscErrorCode ExpandJmap_Internal(PetscCount nnz1, PetscCount nnz, const PetscCount imap[], const PetscCount jmap[], PetscCount jmap_new[])
6373 {
6374   PetscCount k, p;
6375 
6376   PetscFunctionBegin;
6377   jmap_new[0] = 0;
6378   p           = nnz;                /* p loops over jmap_new[] backwards */
6379   for (k = nnz1 - 1; k >= 0; k--) { /* k loops over imap[] */
6380     for (; p > imap[k]; p--) jmap_new[p] = jmap[k + 1];
6381   }
6382   for (; p >= 0; p--) jmap_new[p] = jmap[0];
6383   PetscFunctionReturn(PETSC_SUCCESS);
6384 }
6385 
6386 static PetscErrorCode MatCOOStructDestroy_MPIAIJ(void **data)
6387 {
6388   MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)*data;
6389 
6390   PetscFunctionBegin;
6391   PetscCall(PetscSFDestroy(&coo->sf));
6392   PetscCall(PetscFree(coo->Aperm1));
6393   PetscCall(PetscFree(coo->Bperm1));
6394   PetscCall(PetscFree(coo->Ajmap1));
6395   PetscCall(PetscFree(coo->Bjmap1));
6396   PetscCall(PetscFree(coo->Aimap2));
6397   PetscCall(PetscFree(coo->Bimap2));
6398   PetscCall(PetscFree(coo->Aperm2));
6399   PetscCall(PetscFree(coo->Bperm2));
6400   PetscCall(PetscFree(coo->Ajmap2));
6401   PetscCall(PetscFree(coo->Bjmap2));
6402   PetscCall(PetscFree(coo->Cperm1));
6403   PetscCall(PetscFree2(coo->sendbuf, coo->recvbuf));
6404   PetscCall(PetscFree(coo));
6405   PetscFunctionReturn(PETSC_SUCCESS);
6406 }
6407 
6408 PetscErrorCode MatSetPreallocationCOO_MPIAIJ(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
6409 {
6410   MPI_Comm             comm;
6411   PetscMPIInt          rank, size;
6412   PetscInt             m, n, M, N, rstart, rend, cstart, cend; /* Sizes, indices of row/col, therefore with type PetscInt */
6413   PetscCount           k, p, q, rem;                           /* Loop variables over coo arrays */
6414   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6415   PetscContainer       container;
6416   MatCOOStruct_MPIAIJ *coo;
6417 
6418   PetscFunctionBegin;
6419   PetscCall(PetscFree(mpiaij->garray));
6420   PetscCall(VecDestroy(&mpiaij->lvec));
6421 #if defined(PETSC_USE_CTABLE)
6422   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
6423 #else
6424   PetscCall(PetscFree(mpiaij->colmap));
6425 #endif
6426   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
6427   mat->assembled     = PETSC_FALSE;
6428   mat->was_assembled = PETSC_FALSE;
6429 
6430   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
6431   PetscCallMPI(MPI_Comm_size(comm, &size));
6432   PetscCallMPI(MPI_Comm_rank(comm, &rank));
6433   PetscCall(PetscLayoutSetUp(mat->rmap));
6434   PetscCall(PetscLayoutSetUp(mat->cmap));
6435   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6436   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6437   PetscCall(MatGetLocalSize(mat, &m, &n));
6438   PetscCall(MatGetSize(mat, &M, &N));
6439 
6440   /* Sort (i,j) by row along with a permutation array, so that the to-be-ignored */
6441   /* entries come first, then local rows, then remote rows.                     */
6442   PetscCount n1 = coo_n, *perm1;
6443   PetscInt  *i1 = coo_i, *j1 = coo_j;
6444 
6445   PetscCall(PetscMalloc1(n1, &perm1));
6446   for (k = 0; k < n1; k++) perm1[k] = k;
6447 
6448   /* Manipulate indices so that entries with negative row or col indices will have the smallest
6449      row indices, local entries will have greater but negative row indices, and remote entries
6450      will have positive row indices.
6451   */
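  /* Illustration (hypothetical numbers, assuming rstart=10, rend=20 and 32-bit PetscInt, i.e. PETSC_INT_MAX=2147483647):
     an entry with i=-1 or j=-1 gets row index PETSC_INT_MIN; a local entry with i=12 becomes 12-2147483647=-2147483635,
     which lies in [-PETSC_INT_MAX,-1]; a remote entry with i=42 keeps 42. Sorting by row therefore orders the entries as
     ignored < local < remote, and local row indices are recovered below by adding PETSC_INT_MAX back. */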
6452   for (k = 0; k < n1; k++) {
6453     if (i1[k] < 0 || j1[k] < 0) i1[k] = PETSC_INT_MIN;                /* e.g., -2^31, minimal to move them ahead */
6454     else if (i1[k] >= rstart && i1[k] < rend) i1[k] -= PETSC_INT_MAX; /* e.g., minus 2^31-1 to shift local rows to range of [-PETSC_INT_MAX, -1] */
6455     else {
6456       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "MAT_NO_OFF_PROC_ENTRIES is set but entries are being inserted into remote rows");
6457       if (mpiaij->donotstash) i1[k] = PETSC_INT_MIN; /* Ignore offproc entries as if they had negative indices */
6458     }
6459   }
6460 
6461   /* Sort by row; after that, [0,k) have ignored entries, [k,rem) have local rows and [rem,n1) have remote rows */
6462   PetscCall(PetscSortIntWithIntCountArrayPair(n1, i1, j1, perm1));
6463 
6464   /* Advance k to the first entry we need to take care of */
6465   for (k = 0; k < n1; k++)
6466     if (i1[k] > PETSC_INT_MIN) break;
6467   PetscCount i1start = k;
6468 
6469   PetscCall(PetscSortedIntUpperBound(i1, k, n1, rend - 1 - PETSC_INT_MAX, &rem)); /* rem is upper bound of the last local row */
6470   for (; k < rem; k++) i1[k] += PETSC_INT_MAX;                                    /* Revert row indices of local rows*/
6471 
6472   /*           Send remote rows to their owner                                  */
6473   /* Find which rows should be sent to which remote ranks*/
6474   PetscInt        nsend = 0; /* Number of MPI ranks to send data to */
6475   PetscMPIInt    *sendto;    /* [nsend], storing remote ranks */
6476   PetscInt       *nentries;  /* [nsend], storing number of entries sent to remote ranks; Assume PetscInt is big enough for this count, and error if not */
6477   const PetscInt *ranges;
6478   PetscInt        maxNsend = size >= 128 ? 128 : size; /* Assume max 128 neighbors; realloc when needed */
6479 
6480   PetscCall(PetscLayoutGetRanges(mat->rmap, &ranges));
6481   PetscCall(PetscMalloc2(maxNsend, &sendto, maxNsend, &nentries));
6482   for (k = rem; k < n1;) {
6483     PetscMPIInt owner;
6484     PetscInt    firstRow, lastRow;
6485 
6486     /* Locate a row range */
6487     firstRow = i1[k]; /* first row of this owner */
6488     PetscCall(PetscLayoutFindOwner(mat->rmap, firstRow, &owner));
6489     lastRow = ranges[owner + 1] - 1; /* last row of this owner */
6490 
6491     /* Find the first index 'p' in [k,n1) with i[p] belonging to the next owner */
6492     PetscCall(PetscSortedIntUpperBound(i1, k, n1, lastRow, &p));
6493 
6494     /* All entries in [k,p) belong to this remote owner */
6495     if (nsend >= maxNsend) { /* Double the remote ranks arrays if not long enough */
6496       PetscMPIInt *sendto2;
6497       PetscInt    *nentries2;
6498       PetscInt     maxNsend2 = (maxNsend <= size / 2) ? maxNsend * 2 : size;
6499 
6500       PetscCall(PetscMalloc2(maxNsend2, &sendto2, maxNsend2, &nentries2));
6501       PetscCall(PetscArraycpy(sendto2, sendto, maxNsend));
6502       PetscCall(PetscArraycpy(nentries2, nentries, maxNsend));
6503       PetscCall(PetscFree2(sendto, nentries));
6504       sendto   = sendto2;
6505       nentries = nentries2;
6506       maxNsend = maxNsend2;
6507     }
6508     sendto[nsend] = owner;
6509     PetscCall(PetscIntCast(p - k, &nentries[nsend]));
6510     nsend++;
6511     k = p;
6512   }
6513 
6514   /* Build 1st SF to know offsets on remote to send data */
6515   PetscSF      sf1;
6516   PetscInt     nroots = 1, nroots2 = 0;
6517   PetscInt     nleaves = nsend, nleaves2 = 0;
6518   PetscInt    *offsets;
6519   PetscSFNode *iremote;
6520 
6521   PetscCall(PetscSFCreate(comm, &sf1));
6522   PetscCall(PetscMalloc1(nsend, &iremote));
6523   PetscCall(PetscMalloc1(nsend, &offsets));
6524   for (k = 0; k < nsend; k++) {
6525     iremote[k].rank  = sendto[k];
6526     iremote[k].index = 0;
6527     nleaves2 += nentries[k];
6528     PetscCheck(nleaves2 >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF leaves is too large for PetscInt");
6529   }
6530   PetscCall(PetscSFSetGraph(sf1, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6531   PetscCall(PetscSFFetchAndOpWithMemTypeBegin(sf1, MPIU_INT, PETSC_MEMTYPE_HOST, &nroots2 /*rootdata*/, PETSC_MEMTYPE_HOST, nentries /*leafdata*/, PETSC_MEMTYPE_HOST, offsets /*leafupdate*/, MPI_SUM));
6532   PetscCall(PetscSFFetchAndOpEnd(sf1, MPIU_INT, &nroots2, nentries, offsets, MPI_SUM)); /* Would nroots2 overflow, we check offsets[] below */
6533   PetscCall(PetscSFDestroy(&sf1));
6534   PetscAssert(nleaves2 == n1 - rem, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nleaves2 %" PetscInt_FMT " != number of remote entries %" PetscCount_FMT, nleaves2, n1 - rem);
6535 
6536   /* Build 2nd SF to send remote COOs to their owner */
6537   PetscSF sf2;
6538   nroots  = nroots2;
6539   nleaves = nleaves2;
6540   PetscCall(PetscSFCreate(comm, &sf2));
6541   PetscCall(PetscSFSetFromOptions(sf2));
6542   PetscCall(PetscMalloc1(nleaves, &iremote));
6543   p = 0;
6544   for (k = 0; k < nsend; k++) {
6545     PetscCheck(offsets[k] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF roots is too large for PetscInt");
6546     for (q = 0; q < nentries[k]; q++, p++) {
6547       iremote[p].rank = sendto[k];
6548       PetscCall(PetscIntCast(offsets[k] + q, &iremote[p].index));
6549     }
6550   }
6551   PetscCall(PetscSFSetGraph(sf2, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6552 
6553   /* Send the remote COOs to their owner */
6554   PetscInt    n2 = nroots, *i2, *j2; /* Buffers for received COOs from other ranks, along with a permutation array */
6555   PetscCount *perm2;                 /* Though PetscInt is enough for remote entries, we use PetscCount here as we want to reuse MatSplitEntries_Internal() */
6556   PetscCall(PetscMalloc3(n2, &i2, n2, &j2, n2, &perm2));
6557   PetscAssert(rem == 0 || i1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6558   PetscAssert(rem == 0 || j1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6559   PetscInt *i1prem = PetscSafePointerPlusOffset(i1, rem);
6560   PetscInt *j1prem = PetscSafePointerPlusOffset(j1, rem);
6561   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, i1prem, PETSC_MEMTYPE_HOST, i2, MPI_REPLACE));
6562   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, i1prem, i2, MPI_REPLACE));
6563   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, j1prem, PETSC_MEMTYPE_HOST, j2, MPI_REPLACE));
6564   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, j1prem, j2, MPI_REPLACE));
6565 
6566   PetscCall(PetscFree(offsets));
6567   PetscCall(PetscFree2(sendto, nentries));
6568 
6569   /* Sort received COOs by row along with the permutation array     */
6570   for (k = 0; k < n2; k++) perm2[k] = k;
6571   PetscCall(PetscSortIntWithIntCountArrayPair(n2, i2, j2, perm2));
6572 
6573   /* sf2 only sends contiguous leafdata to contiguous rootdata. We record the permutation which will be used to fill leafdata */
6574   PetscCount *Cperm1;
6575   PetscAssert(rem == 0 || perm1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6576   PetscCount *perm1prem = PetscSafePointerPlusOffset(perm1, rem);
6577   PetscCall(PetscMalloc1(nleaves, &Cperm1));
6578   PetscCall(PetscArraycpy(Cperm1, perm1prem, nleaves));
6579 
6580   /* Support for HYPRE matrices, kind of a hack.
6581      Swap min column with diagonal so that diagonal values will go first */
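  /* Illustrative example (hypothetical indices): if local row r (r > 2) has diag-block entries with global columns
     {cstart+r (the diagonal), cstart+2, cstart+5} and cstart+2 is the minimum, the two column indices are swapped:
     the entry carrying the diagonal value gets column cstart+2 and the entry carrying the cstart+2 value gets the
     diagonal column, so after the per-row sort in MatSplitEntries_Internal() the diagonal value comes first.
     Rows without a diagonal entry (hasdiag unset) are left untouched. */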
6582   PetscBool hypre;
6583   PetscCall(PetscStrcmp("_internal_COO_mat_for_hypre", ((PetscObject)mat)->name, &hypre));
6584   if (hypre) {
6585     PetscInt *minj;
6586     PetscBT   hasdiag;
6587 
6588     PetscCall(PetscBTCreate(m, &hasdiag));
6589     PetscCall(PetscMalloc1(m, &minj));
6590     for (k = 0; k < m; k++) minj[k] = PETSC_INT_MAX;
6591     for (k = i1start; k < rem; k++) {
6592       if (j1[k] < cstart || j1[k] >= cend) continue;
6593       const PetscInt rindex = i1[k] - rstart;
6594       if ((j1[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6595       minj[rindex] = PetscMin(minj[rindex], j1[k]);
6596     }
6597     for (k = 0; k < n2; k++) {
6598       if (j2[k] < cstart || j2[k] >= cend) continue;
6599       const PetscInt rindex = i2[k] - rstart;
6600       if ((j2[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6601       minj[rindex] = PetscMin(minj[rindex], j2[k]);
6602     }
6603     for (k = i1start; k < rem; k++) {
6604       const PetscInt rindex = i1[k] - rstart;
6605       if (j1[k] < cstart || j1[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6606       if (j1[k] == minj[rindex]) j1[k] = i1[k] + (cstart - rstart);
6607       else if ((j1[k] - cstart) == rindex) j1[k] = minj[rindex];
6608     }
6609     for (k = 0; k < n2; k++) {
6610       const PetscInt rindex = i2[k] - rstart;
6611       if (j2[k] < cstart || j2[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6612       if (j2[k] == minj[rindex]) j2[k] = i2[k] + (cstart - rstart);
6613       else if ((j2[k] - cstart) == rindex) j2[k] = minj[rindex];
6614     }
6615     PetscCall(PetscBTDestroy(&hasdiag));
6616     PetscCall(PetscFree(minj));
6617   }
6618 
6619   /* Split local COOs and received COOs into diag/offdiag portions */
6620   PetscCount *rowBegin1, *rowMid1, *rowEnd1;
6621   PetscCount *Ajmap1, *Aperm1, *Bjmap1, *Bperm1;
6622   PetscCount  Annz1, Bnnz1, Atot1, Btot1;
6623   PetscCount *rowBegin2, *rowMid2, *rowEnd2;
6624   PetscCount *Ajmap2, *Aperm2, *Bjmap2, *Bperm2;
6625   PetscCount  Annz2, Bnnz2, Atot2, Btot2;
6626 
6627   PetscCall(PetscCalloc3(m, &rowBegin1, m, &rowMid1, m, &rowEnd1));
6628   PetscCall(PetscCalloc3(m, &rowBegin2, m, &rowMid2, m, &rowEnd2));
6629   PetscCall(MatSplitEntries_Internal(mat, rem, i1, j1, perm1, rowBegin1, rowMid1, rowEnd1, &Atot1, &Aperm1, &Annz1, &Ajmap1, &Btot1, &Bperm1, &Bnnz1, &Bjmap1));
6630   PetscCall(MatSplitEntries_Internal(mat, n2, i2, j2, perm2, rowBegin2, rowMid2, rowEnd2, &Atot2, &Aperm2, &Annz2, &Ajmap2, &Btot2, &Bperm2, &Bnnz2, &Bjmap2));
6631 
6632   /* Merge local COOs with received COOs: diag with diag, offdiag with offdiag */
6633   PetscInt *Ai, *Bi;
6634   PetscInt *Aj, *Bj;
6635 
6636   PetscCall(PetscMalloc1(m + 1, &Ai));
6637   PetscCall(PetscMalloc1(m + 1, &Bi));
6638   PetscCall(PetscMalloc1(Annz1 + Annz2, &Aj)); /* Since local and remote entries might have dups, we might allocate excess memory */
6639   PetscCall(PetscMalloc1(Bnnz1 + Bnnz2, &Bj));
6640 
6641   PetscCount *Aimap1, *Bimap1, *Aimap2, *Bimap2;
6642   PetscCall(PetscMalloc1(Annz1, &Aimap1));
6643   PetscCall(PetscMalloc1(Bnnz1, &Bimap1));
6644   PetscCall(PetscMalloc1(Annz2, &Aimap2));
6645   PetscCall(PetscMalloc1(Bnnz2, &Bimap2));
6646 
6647   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowBegin1, rowMid1, rowBegin2, rowMid2, Ajmap1, Ajmap2, Aimap1, Aimap2, Ai, Aj));
6648   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowMid1, rowEnd1, rowMid2, rowEnd2, Bjmap1, Bjmap2, Bimap1, Bimap2, Bi, Bj));
6649 
6650   /* Expand Ajmap1/Bjmap1 so that they are indexed by the nonzeros of A/B, since      */
6651   /* nonzeros in A/B most likely have local contributing entries                      */
6652   PetscInt    Annz = Ai[m];
6653   PetscInt    Bnnz = Bi[m];
6654   PetscCount *Ajmap1_new, *Bjmap1_new;
6655 
6656   PetscCall(PetscMalloc1(Annz + 1, &Ajmap1_new));
6657   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap1_new));
6658 
6659   PetscCall(ExpandJmap_Internal(Annz1, Annz, Aimap1, Ajmap1, Ajmap1_new));
6660   PetscCall(ExpandJmap_Internal(Bnnz1, Bnnz, Bimap1, Bjmap1, Bjmap1_new));
6661 
6662   PetscCall(PetscFree(Aimap1));
6663   PetscCall(PetscFree(Ajmap1));
6664   PetscCall(PetscFree(Bimap1));
6665   PetscCall(PetscFree(Bjmap1));
6666   PetscCall(PetscFree3(rowBegin1, rowMid1, rowEnd1));
6667   PetscCall(PetscFree3(rowBegin2, rowMid2, rowEnd2));
6668   PetscCall(PetscFree(perm1));
6669   PetscCall(PetscFree3(i2, j2, perm2));
6670 
6671   Ajmap1 = Ajmap1_new;
6672   Bjmap1 = Bjmap1_new;
6673 
6674   /* Reallocate Aj, Bj once we know actual numbers of unique nonzeros in A and B */
6675   if (Annz < Annz1 + Annz2) {
6676     PetscInt *Aj_new;
6677     PetscCall(PetscMalloc1(Annz, &Aj_new));
6678     PetscCall(PetscArraycpy(Aj_new, Aj, Annz));
6679     PetscCall(PetscFree(Aj));
6680     Aj = Aj_new;
6681   }
6682 
6683   if (Bnnz < Bnnz1 + Bnnz2) {
6684     PetscInt *Bj_new;
6685     PetscCall(PetscMalloc1(Bnnz, &Bj_new));
6686     PetscCall(PetscArraycpy(Bj_new, Bj, Bnnz));
6687     PetscCall(PetscFree(Bj));
6688     Bj = Bj_new;
6689   }
6690 
6691   /* Create new submatrices for on-process and off-process coupling                  */
6692   PetscScalar     *Aa, *Ba;
6693   MatType          rtype;
6694   Mat_SeqAIJ      *a, *b;
6695   PetscObjectState state;
6696   PetscCall(PetscCalloc1(Annz, &Aa)); /* Zero matrix on device */
6697   PetscCall(PetscCalloc1(Bnnz, &Ba));
6698   /* make Aj[] local, i.e., based off the start column of the diagonal portion */
6699   if (cstart) {
6700     for (k = 0; k < Annz; k++) Aj[k] -= cstart;
6701   }
6702 
6703   PetscCall(MatGetRootType_Private(mat, &rtype));
6704 
6705   MatSeqXAIJGetOptions_Private(mpiaij->A);
6706   PetscCall(MatDestroy(&mpiaij->A));
6707   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, Ai, Aj, Aa, &mpiaij->A));
6708   PetscCall(MatSetBlockSizesFromMats(mpiaij->A, mat, mat));
6709   MatSeqXAIJRestoreOptions_Private(mpiaij->A);
6710 
6711   MatSeqXAIJGetOptions_Private(mpiaij->B);
6712   PetscCall(MatDestroy(&mpiaij->B));
6713   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, mat->cmap->N, Bi, Bj, Ba, &mpiaij->B));
6714   PetscCall(MatSetBlockSizesFromMats(mpiaij->B, mat, mat));
6715   MatSeqXAIJRestoreOptions_Private(mpiaij->B);
6716 
6717   PetscCall(MatSetUpMultiply_MPIAIJ(mat));
6718   mat->was_assembled = PETSC_TRUE; // was_assembled in effect means the Mvctx is built; doing so avoids redundant MatSetUpMultiply_MPIAIJ
6719   state              = mpiaij->A->nonzerostate + mpiaij->B->nonzerostate;
6720   PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
6721 
6722   a          = (Mat_SeqAIJ *)mpiaij->A->data;
6723   b          = (Mat_SeqAIJ *)mpiaij->B->data;
6724   a->free_a  = PETSC_TRUE;
6725   a->free_ij = PETSC_TRUE;
6726   b->free_a  = PETSC_TRUE;
6727   b->free_ij = PETSC_TRUE;
6728   a->maxnz   = a->nz;
6729   b->maxnz   = b->nz;
6730 
6731   /* conversion must happen AFTER multiply setup */
6732   PetscCall(MatConvert(mpiaij->A, rtype, MAT_INPLACE_MATRIX, &mpiaij->A));
6733   PetscCall(MatConvert(mpiaij->B, rtype, MAT_INPLACE_MATRIX, &mpiaij->B));
6734   PetscCall(VecDestroy(&mpiaij->lvec));
6735   PetscCall(MatCreateVecs(mpiaij->B, &mpiaij->lvec, NULL));
6736 
6737   // Put the COO struct in a container and then attach that to the matrix
6738   PetscCall(PetscMalloc1(1, &coo));
6739   coo->n       = coo_n;
6740   coo->sf      = sf2;
6741   coo->sendlen = nleaves;
6742   coo->recvlen = nroots;
6743   coo->Annz    = Annz;
6744   coo->Bnnz    = Bnnz;
6745   coo->Annz2   = Annz2;
6746   coo->Bnnz2   = Bnnz2;
6747   coo->Atot1   = Atot1;
6748   coo->Atot2   = Atot2;
6749   coo->Btot1   = Btot1;
6750   coo->Btot2   = Btot2;
6751   coo->Ajmap1  = Ajmap1;
6752   coo->Aperm1  = Aperm1;
6753   coo->Bjmap1  = Bjmap1;
6754   coo->Bperm1  = Bperm1;
6755   coo->Aimap2  = Aimap2;
6756   coo->Ajmap2  = Ajmap2;
6757   coo->Aperm2  = Aperm2;
6758   coo->Bimap2  = Bimap2;
6759   coo->Bjmap2  = Bjmap2;
6760   coo->Bperm2  = Bperm2;
6761   coo->Cperm1  = Cperm1;
6762   // Allocate the send/recv buffers at preallocation time; if unused, they have zero cost on the host
6763   PetscCall(PetscMalloc2(coo->sendlen, &coo->sendbuf, coo->recvlen, &coo->recvbuf));
6764   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
6765   PetscCall(PetscContainerSetPointer(container, coo));
6766   PetscCall(PetscContainerSetCtxDestroy(container, MatCOOStructDestroy_MPIAIJ));
6767   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject)container));
6768   PetscCall(PetscContainerDestroy(&container));
6769   PetscFunctionReturn(PETSC_SUCCESS);
6770 }
6771 
6772 static PetscErrorCode MatSetValuesCOO_MPIAIJ(Mat mat, const PetscScalar v[], InsertMode imode)
6773 {
6774   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6775   Mat                  A = mpiaij->A, B = mpiaij->B;
6776   PetscScalar         *Aa, *Ba;
6777   PetscScalar         *sendbuf, *recvbuf;
6778   const PetscCount    *Ajmap1, *Ajmap2, *Aimap2;
6779   const PetscCount    *Bjmap1, *Bjmap2, *Bimap2;
6780   const PetscCount    *Aperm1, *Aperm2, *Bperm1, *Bperm2;
6781   const PetscCount    *Cperm1;
6782   PetscContainer       container;
6783   MatCOOStruct_MPIAIJ *coo;
6784 
6785   PetscFunctionBegin;
6786   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container));
6787   PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
6788   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
6789   sendbuf = coo->sendbuf;
6790   recvbuf = coo->recvbuf;
6791   Ajmap1  = coo->Ajmap1;
6792   Ajmap2  = coo->Ajmap2;
6793   Aimap2  = coo->Aimap2;
6794   Bjmap1  = coo->Bjmap1;
6795   Bjmap2  = coo->Bjmap2;
6796   Bimap2  = coo->Bimap2;
6797   Aperm1  = coo->Aperm1;
6798   Aperm2  = coo->Aperm2;
6799   Bperm1  = coo->Bperm1;
6800   Bperm2  = coo->Bperm2;
6801   Cperm1  = coo->Cperm1;
6802 
6803   PetscCall(MatSeqAIJGetArray(A, &Aa)); /* Might read and write matrix values */
6804   PetscCall(MatSeqAIJGetArray(B, &Ba));
6805 
6806   /* Pack entries to be sent to remote */
6807   for (PetscCount i = 0; i < coo->sendlen; i++) sendbuf[i] = v[Cperm1[i]];
6808 
6809   /* Send remote entries to their owner and overlap the communication with local computation */
6810   PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_HOST, sendbuf, PETSC_MEMTYPE_HOST, recvbuf, MPI_REPLACE));
6811   /* Add local entries to A and B */
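  /* Illustrative example (hypothetical values): with Ajmap1 = {0,2,3} and Aperm1 = {4,0,2}, the 0th nonzero of A
     receives v[4] + v[0] and the 1st nonzero receives v[2]; with INSERT_VALUES the old Aa[i] is discarded, whereas
     with ADD_VALUES it is kept. */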
6812   for (PetscCount i = 0; i < coo->Annz; i++) { /* All nonzeros in A are either zero'ed or added with a value (i.e., initialized) */
6813     PetscScalar sum = 0.0;                     /* Do partial summation first to improve numerical stability */
6814     for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += v[Aperm1[k]];
6815     Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
6816   }
6817   for (PetscCount i = 0; i < coo->Bnnz; i++) {
6818     PetscScalar sum = 0.0;
6819     for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += v[Bperm1[k]];
6820     Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
6821   }
6822   PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE));
6823 
6824   /* Add received remote entries to A and B */
6825   for (PetscCount i = 0; i < coo->Annz2; i++) {
6826     for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += recvbuf[Aperm2[k]];
6827   }
6828   for (PetscCount i = 0; i < coo->Bnnz2; i++) {
6829     for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += recvbuf[Bperm2[k]];
6830   }
6831   PetscCall(MatSeqAIJRestoreArray(A, &Aa));
6832   PetscCall(MatSeqAIJRestoreArray(B, &Ba));
6833   PetscFunctionReturn(PETSC_SUCCESS);
6834 }
6835 
6836 /*MC
6837    MATMPIAIJ - MATMPIAIJ = "mpiaij" - A matrix type to be used for parallel sparse matrices.
6838 
6839    Options Database Keys:
6840 . -mat_type mpiaij - sets the matrix type to `MATMPIAIJ` during a call to `MatSetFromOptions()`
6841 
6842    Level: beginner
6843 
6844    Notes:
6845    `MatSetValues()` may be called for this matrix type with a `NULL` argument for the numerical values;
6846     in this case the values associated with the rows and columns one passes in are set to zero
6847     in the matrix.
6848 
6849     `MatSetOption`(A, `MAT_STRUCTURE_ONLY`, `PETSC_TRUE`) may be called for this matrix type. In this case no
6850     space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored.
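
   A minimal creation sketch (illustrative only; see `MatCreateAIJ()` and `MatMPIAIJSetPreallocation()` for the complete interface)
.vb
   Mat A;
   MatCreate(PETSC_COMM_WORLD, &A);
   MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, M, N);
   MatSetType(A, MATMPIAIJ);                       // or select -mat_type mpiaij via MatSetFromOptions()
   MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL); // rough per-row estimates for the diag/offdiag blocks
   // ... MatSetValues(), MatAssemblyBegin(), MatAssemblyEnd() ...
.ve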
6851 
6852 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJ`, `MATAIJ`, `MatCreateAIJ()`
6853 M*/
6854 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJ(Mat B)
6855 {
6856   Mat_MPIAIJ *b;
6857   PetscMPIInt size;
6858 
6859   PetscFunctionBegin;
6860   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
6861 
6862   PetscCall(PetscNew(&b));
6863   B->data       = (void *)b;
6864   B->ops[0]     = MatOps_Values;
6865   B->assembled  = PETSC_FALSE;
6866   B->insertmode = NOT_SET_VALUES;
6867   b->size       = size;
6868 
6869   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B), &b->rank));
6870 
6871   /* build cache for off array entries formed */
6872   PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)B), 1, &B->stash));
6873 
6874   b->donotstash  = PETSC_FALSE;
6875   b->colmap      = NULL;
6876   b->garray      = NULL;
6877   b->roworiented = PETSC_TRUE;
6878 
6879   /* stuff used for matrix vector multiply */
6880   b->lvec  = NULL;
6881   b->Mvctx = NULL;
6882 
6883   /* stuff for MatGetRow() */
6884   b->rowindices   = NULL;
6885   b->rowvalues    = NULL;
6886   b->getrowactive = PETSC_FALSE;
6887 
6888   /* flexible pointer used in CUSPARSE classes */
6889   b->spptr = NULL;
6890 
6891   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetUseScalableIncreaseOverlap_C", MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ));
6892   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_MPIAIJ));
6893   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_MPIAIJ));
6894   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_MPIAIJ));
6895   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJ));
6896   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetPreallocation_C", MatResetPreallocation_MPIAIJ));
6897   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocationCSR_C", MatMPIAIJSetPreallocationCSR_MPIAIJ));
6898   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDiagonalScaleLocal_C", MatDiagonalScaleLocal_MPIAIJ));
6899   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijperm_C", MatConvert_MPIAIJ_MPIAIJPERM));
6900   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijsell_C", MatConvert_MPIAIJ_MPIAIJSELL));
6901 #if defined(PETSC_HAVE_CUDA)
6902   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcusparse_C", MatConvert_MPIAIJ_MPIAIJCUSPARSE));
6903 #endif
6904 #if defined(PETSC_HAVE_HIP)
6905   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijhipsparse_C", MatConvert_MPIAIJ_MPIAIJHIPSPARSE));
6906 #endif
6907 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6908   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijkokkos_C", MatConvert_MPIAIJ_MPIAIJKokkos));
6909 #endif
6910 #if defined(PETSC_HAVE_MKL_SPARSE)
6911   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijmkl_C", MatConvert_MPIAIJ_MPIAIJMKL));
6912 #endif
6913   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcrl_C", MatConvert_MPIAIJ_MPIAIJCRL));
6914   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpibaij_C", MatConvert_MPIAIJ_MPIBAIJ));
6915   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisbaij_C", MatConvert_MPIAIJ_MPISBAIJ));
6916   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpidense_C", MatConvert_MPIAIJ_MPIDense));
6917 #if defined(PETSC_HAVE_ELEMENTAL)
6918   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_elemental_C", MatConvert_MPIAIJ_Elemental));
6919 #endif
6920 #if defined(PETSC_HAVE_SCALAPACK)
6921   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_scalapack_C", MatConvert_AIJ_ScaLAPACK));
6922 #endif
6923   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_is_C", MatConvert_XAIJ_IS));
6924   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisell_C", MatConvert_MPIAIJ_MPISELL));
6925 #if defined(PETSC_HAVE_HYPRE)
6926   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_hypre_C", MatConvert_AIJ_HYPRE));
6927   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", MatProductSetFromOptions_Transpose_AIJ_AIJ));
6928 #endif
6929   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_is_mpiaij_C", MatProductSetFromOptions_IS_XAIJ));
6930   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_mpiaij_mpiaij_C", MatProductSetFromOptions_MPIAIJ));
6931   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJ));
6932   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJ));
6933   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJ));
6934   PetscFunctionReturn(PETSC_SUCCESS);
6935 }
6936 
6937 /*@
6938   MatCreateMPIAIJWithSplitArrays - creates a `MATMPIAIJ` matrix using arrays that contain the "diagonal"
6939   and "off-diagonal" part of the matrix in CSR format.
6940 
6941   Collective
6942 
6943   Input Parameters:
6944 + comm - MPI communicator
6945 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
6946 . n    - This value should be the same as the local size used in creating the
6947          x vector for the matrix-vector product $y = Ax$ (or `PETSC_DECIDE` to have it
6948          calculated if `N` is given). For square matrices `n` is almost always `m`.
6949 . M    - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
6950 . N    - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
6951 . i    - row indices for "diagonal" portion of matrix; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
6952 . j    - column indices, which must be local, i.e., based off the start column of the diagonal portion
6953 . a    - matrix values
6954 . oi   - row indices for "off-diagonal" portion of matrix; that is oi[0] = 0, oi[row] = oi[row-1] + number of elements in that row of the matrix
6955 . oj   - column indices, which must be global, representing global columns in the `MATMPIAIJ` matrix
6956 - oa   - matrix values
6957 
6958   Output Parameter:
6959 . mat - the matrix
6960 
6961   Level: advanced
6962 
6963   Notes:
6964   The `i`, `j`, and `a` arrays ARE NOT copied by this routine into the internal format used by PETSc (even in Fortran). The user
6965   must free the arrays once the matrix has been destroyed and not before.
6966 
6967   The `i` and `j` indices are 0 based
6968 
6969   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix
6970 
6971   This sets local rows and cannot be used to set off-processor values.
6972 
6973   Use of this routine is discouraged because it is inflexible and cumbersome to use. It is extremely rare that a
6974   legacy application natively assembles into exactly this split format. The code to do so is nontrivial and does
6975   not easily support in-place reassembly. It is recommended to use `MatSetValues()` (or a variant thereof) because
6976   the resulting assembly is easier to implement, will work with any matrix format, and the user does not have to
6977   keep track of the underlying array. Use `MatSetOption`(A,`MAT_NO_OFF_PROC_ENTRIES`,`PETSC_TRUE`) to disable all
6978   communication if it is known that only local entries will be set.
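
  For illustration only (made-up sizes): if a rank owns local rows {2,3} of a matrix with 6 global columns and its diagonal block
  covers global columns [cstart,cend) = [2,4), then local entries (2,2)=1.0, (2,3)=2.0, (2,5)=3.0, (3,3)=4.0, (3,0)=5.0 would be passed as
.vb
  i  = {0, 2, 3};  j  = {0, 1, 1};  a  = {1.0, 2.0, 4.0};  // diagonal block, j uses local numbering (relative to cstart)
  oi = {0, 1, 2};  oj = {5, 0};     oa = {3.0, 5.0};       // off-diagonal block, oj uses global numbering
.ve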
6979 
6980 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
6981           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithArrays()`
6982 @*/
6983 PetscErrorCode MatCreateMPIAIJWithSplitArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt i[], PetscInt j[], PetscScalar a[], PetscInt oi[], PetscInt oj[], PetscScalar oa[], Mat *mat)
6984 {
6985   Mat_MPIAIJ *maij;
6986 
6987   PetscFunctionBegin;
6988   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
6989   PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
6990   PetscCheck(oi[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "oi (row indices) must start with 0");
6991   PetscCall(MatCreate(comm, mat));
6992   PetscCall(MatSetSizes(*mat, m, n, M, N));
6993   PetscCall(MatSetType(*mat, MATMPIAIJ));
6994   maij = (Mat_MPIAIJ *)(*mat)->data;
6995 
6996   (*mat)->preallocated = PETSC_TRUE;
6997 
6998   PetscCall(PetscLayoutSetUp((*mat)->rmap));
6999   PetscCall(PetscLayoutSetUp((*mat)->cmap));
7000 
7001   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, i, j, a, &maij->A));
7002   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, (*mat)->cmap->N, oi, oj, oa, &maij->B));
7003 
7004   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
7005   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
7006   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
7007   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
7008   PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
7009   PetscFunctionReturn(PETSC_SUCCESS);
7010 }
7011 
7012 typedef struct {
7013   Mat       *mp;    /* intermediate products */
7014   PetscBool *mptmp; /* is the intermediate product temporary ? */
7015   PetscInt   cp;    /* number of intermediate products */
7016 
7017   /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
7018   PetscInt    *startsj_s, *startsj_r;
7019   PetscScalar *bufa;
7020   Mat          P_oth;
7021 
7022   /* may take advantage of merging product->B */
7023   Mat Bloc; /* B-local by merging diag and off-diag */
7024 
7025   /* cusparse does not support splitting the symbolic and numeric phases.
7026      When api_user is true, we don't need to update the numerical values
7027      of the temporary storage */
7028   PetscBool reusesym;
7029 
7030   /* support for COO values insertion */
7031   PetscScalar *coo_v, *coo_w; /* store on-process and off-process COO scalars, and used as MPI recv/send buffers respectively */
7032   PetscInt   **own;           /* own[i] points to address of on-process COO indices for Mat mp[i] */
7033   PetscInt   **off;           /* off[i] points to address of off-process COO indices for Mat mp[i] */
7034   PetscBool    hasoffproc;    /* if true, have off-process values insertion (i.e. AtB or PtAP) */
7035   PetscSF      sf;            /* used for non-local values insertion and memory malloc */
7036   PetscMemType mtype;
7037 
7038   /* customization */
7039   PetscBool abmerge;
7040   PetscBool P_oth_bind;
7041 } MatMatMPIAIJBACKEND;
7042 
7043 static PetscErrorCode MatDestroy_MatMatMPIAIJBACKEND(void *data)
7044 {
7045   MatMatMPIAIJBACKEND *mmdata = (MatMatMPIAIJBACKEND *)data;
7046   PetscInt             i;
7047 
7048   PetscFunctionBegin;
7049   PetscCall(PetscFree2(mmdata->startsj_s, mmdata->startsj_r));
7050   PetscCall(PetscFree(mmdata->bufa));
7051   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_v));
7052   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_w));
7053   PetscCall(MatDestroy(&mmdata->P_oth));
7054   PetscCall(MatDestroy(&mmdata->Bloc));
7055   PetscCall(PetscSFDestroy(&mmdata->sf));
7056   for (i = 0; i < mmdata->cp; i++) PetscCall(MatDestroy(&mmdata->mp[i]));
7057   PetscCall(PetscFree2(mmdata->mp, mmdata->mptmp));
7058   PetscCall(PetscFree(mmdata->own[0]));
7059   PetscCall(PetscFree(mmdata->own));
7060   PetscCall(PetscFree(mmdata->off[0]));
7061   PetscCall(PetscFree(mmdata->off));
7062   PetscCall(PetscFree(mmdata));
7063   PetscFunctionReturn(PETSC_SUCCESS);
7064 }
7065 
7066 /* Copy the n selected entries of A whose indices are given in idx[] to v[].
7067    If idx is NULL, copy the whole data array of A to v[]
7068  */
7069 static PetscErrorCode MatSeqAIJCopySubArray(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
7070 {
7071   PetscErrorCode (*f)(Mat, PetscInt, const PetscInt[], PetscScalar[]);
7072 
7073   PetscFunctionBegin;
7074   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", &f));
7075   if (f) {
7076     PetscCall((*f)(A, n, idx, v));
7077   } else {
7078     const PetscScalar *vv;
7079 
7080     PetscCall(MatSeqAIJGetArrayRead(A, &vv));
7081     if (n && idx) {
7082       PetscScalar    *w  = v;
7083       const PetscInt *oi = idx;
7084       PetscInt        j;
7085 
7086       for (j = 0; j < n; j++) *w++ = vv[*oi++];
7087     } else {
7088       PetscCall(PetscArraycpy(v, vv, n));
7089     }
7090     PetscCall(MatSeqAIJRestoreArrayRead(A, &vv));
7091   }
7092   PetscFunctionReturn(PETSC_SUCCESS);
7093 }
7094 
7095 static PetscErrorCode MatProductNumeric_MPIAIJBACKEND(Mat C)
7096 {
7097   MatMatMPIAIJBACKEND *mmdata;
7098   PetscInt             i, n_d, n_o;
7099 
7100   PetscFunctionBegin;
7101   MatCheckProduct(C, 1);
7102   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
7103   mmdata = (MatMatMPIAIJBACKEND *)C->product->data;
7104   if (!mmdata->reusesym) { /* update temporary matrices */
7105     if (mmdata->P_oth) PetscCall(MatGetBrowsOfAoCols_MPIAIJ(C->product->A, C->product->B, MAT_REUSE_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7106     if (mmdata->Bloc) PetscCall(MatMPIAIJGetLocalMatMerge(C->product->B, MAT_REUSE_MATRIX, NULL, &mmdata->Bloc));
7107   }
7108   mmdata->reusesym = PETSC_FALSE;
7109 
7110   for (i = 0; i < mmdata->cp; i++) {
7111     PetscCheck(mmdata->mp[i]->ops->productnumeric, PetscObjectComm((PetscObject)mmdata->mp[i]), PETSC_ERR_PLIB, "Missing numeric op for %s", MatProductTypes[mmdata->mp[i]->product->type]);
7112     PetscCall((*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]));
7113   }
7114   for (i = 0, n_d = 0, n_o = 0; i < mmdata->cp; i++) {
7115     PetscInt noff;
7116 
7117     PetscCall(PetscIntCast(mmdata->off[i + 1] - mmdata->off[i], &noff));
7118     if (mmdata->mptmp[i]) continue;
7119     if (noff) {
7120       PetscInt nown;
7121 
7122       PetscCall(PetscIntCast(mmdata->own[i + 1] - mmdata->own[i], &nown));
7123       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], noff, mmdata->off[i], mmdata->coo_w + n_o));
7124       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], nown, mmdata->own[i], mmdata->coo_v + n_d));
7125       n_o += noff;
7126       n_d += nown;
7127     } else {
7128       Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mmdata->mp[i]->data;
7129 
7130       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], mm->nz, NULL, mmdata->coo_v + n_d));
7131       n_d += mm->nz;
7132     }
7133   }
7134   if (mmdata->hasoffproc) { /* offprocess insertion */
7135     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7136     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7137   }
7138   PetscCall(MatSetValuesCOO(C, mmdata->coo_v, INSERT_VALUES));
7139   PetscFunctionReturn(PETSC_SUCCESS);
7140 }
7141 
7142 /* Support for Pt * A, A * P, or Pt * A * P */
7143 #define MAX_NUMBER_INTERMEDIATE 4
7144 PetscErrorCode MatProductSymbolic_MPIAIJBACKEND(Mat C)
7145 {
7146   Mat_Product           *product = C->product;
7147   Mat                    A, P, mp[MAX_NUMBER_INTERMEDIATE]; /* A, P and a series of intermediate matrices */
7148   Mat_MPIAIJ            *a, *p;
7149   MatMatMPIAIJBACKEND   *mmdata;
7150   ISLocalToGlobalMapping P_oth_l2g = NULL;
7151   IS                     glob      = NULL;
7152   const char            *prefix;
7153   char                   pprefix[256];
7154   const PetscInt        *globidx, *P_oth_idx;
7155   PetscInt               i, j, cp, m, n, M, N, *coo_i, *coo_j;
7156   PetscCount             ncoo, ncoo_d, ncoo_o, ncoo_oown;
7157   PetscInt               cmapt[MAX_NUMBER_INTERMEDIATE], rmapt[MAX_NUMBER_INTERMEDIATE]; /* col/row map type for each Mat in mp[]. */
7158                                                                                          /* type-0: consecutive, start from 0; type-1: consecutive with */
7159                                                                                          /* a base offset; type-2: sparse with a local to global map table */
7160   const PetscInt *cmapa[MAX_NUMBER_INTERMEDIATE], *rmapa[MAX_NUMBER_INTERMEDIATE];       /* col/row local to global map array (table) for type-2 map type */
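  /* For example (hedged illustration): a type-1 map is consecutive starting from a base offset (e.g. the local row/col range start),
     while a type-2 map stores explicit global indices in rmapa[]/cmapa[] (e.g. the garray of an off-diagonal block or the indices of glob). */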
7161 
7162   MatProductType ptype;
7163   PetscBool      mptmp[MAX_NUMBER_INTERMEDIATE], hasoffproc = PETSC_FALSE, iscuda, iship, iskokk;
7164   PetscMPIInt    size;
7165 
7166   PetscFunctionBegin;
7167   MatCheckProduct(C, 1);
7168   PetscCheck(!product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
7169   ptype = product->type;
7170   if (product->A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
7171     ptype                                          = MATPRODUCT_AB;
7172     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
7173   }
7174   switch (ptype) {
7175   case MATPRODUCT_AB:
7176     A          = product->A;
7177     P          = product->B;
7178     m          = A->rmap->n;
7179     n          = P->cmap->n;
7180     M          = A->rmap->N;
7181     N          = P->cmap->N;
7182     hasoffproc = PETSC_FALSE; /* will not scatter mat product values to other processes */
7183     break;
7184   case MATPRODUCT_AtB:
7185     P          = product->A;
7186     A          = product->B;
7187     m          = P->cmap->n;
7188     n          = A->cmap->n;
7189     M          = P->cmap->N;
7190     N          = A->cmap->N;
7191     hasoffproc = PETSC_TRUE;
7192     break;
7193   case MATPRODUCT_PtAP:
7194     A          = product->A;
7195     P          = product->B;
7196     m          = P->cmap->n;
7197     n          = P->cmap->n;
7198     M          = P->cmap->N;
7199     N          = P->cmap->N;
7200     hasoffproc = PETSC_TRUE;
7201     break;
7202   default:
7203     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7204   }
7205   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)C), &size));
7206   if (size == 1) hasoffproc = PETSC_FALSE;
7207 
7208   /* defaults */
7209   for (i = 0; i < MAX_NUMBER_INTERMEDIATE; i++) {
7210     mp[i]    = NULL;
7211     mptmp[i] = PETSC_FALSE;
7212     rmapt[i] = -1;
7213     cmapt[i] = -1;
7214     rmapa[i] = NULL;
7215     cmapa[i] = NULL;
7216   }
7217 
7218   /* customization */
7219   PetscCall(PetscNew(&mmdata));
7220   mmdata->reusesym = product->api_user;
7221   if (ptype == MATPRODUCT_AB) {
7222     if (product->api_user) {
7223       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
7224       PetscCall(PetscOptionsBool("-matmatmult_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7225       PetscCall(PetscOptionsBool("-matmatmult_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7226       PetscOptionsEnd();
7227     } else {
7228       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
7229       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7230       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7231       PetscOptionsEnd();
7232     }
7233   } else if (ptype == MATPRODUCT_PtAP) {
7234     if (product->api_user) {
7235       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
7236       PetscCall(PetscOptionsBool("-matptap_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7237       PetscOptionsEnd();
7238     } else {
7239       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
7240       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7241       PetscOptionsEnd();
7242     }
7243   }
7244   a = (Mat_MPIAIJ *)A->data;
7245   p = (Mat_MPIAIJ *)P->data;
7246   PetscCall(MatSetSizes(C, m, n, M, N));
7247   PetscCall(PetscLayoutSetUp(C->rmap));
7248   PetscCall(PetscLayoutSetUp(C->cmap));
7249   PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
7250   PetscCall(MatGetOptionsPrefix(C, &prefix));
7251 
7252   cp = 0;
7253   switch (ptype) {
7254   case MATPRODUCT_AB: /* A * P */
7255     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7256 
7257     /* A_diag * P_local (merged or not) */
7258     if (mmdata->abmerge) { /* P's diagonal and off-diag blocks are merged to one matrix, then multiplied by A_diag */
7259       /* P is product->B */
7260       PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7261       PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7262       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7263       PetscCall(MatProductSetFill(mp[cp], product->fill));
7264       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7265       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7266       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7267       mp[cp]->product->api_user = product->api_user;
7268       PetscCall(MatProductSetFromOptions(mp[cp]));
7269       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7270       PetscCall(ISGetIndices(glob, &globidx));
7271       rmapt[cp] = 1;
7272       cmapt[cp] = 2;
7273       cmapa[cp] = globidx;
7274       mptmp[cp] = PETSC_FALSE;
7275       cp++;
7276     } else { /* A_diag * P_diag and A_diag * P_off */
7277       PetscCall(MatProductCreate(a->A, p->A, NULL, &mp[cp]));
7278       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7279       PetscCall(MatProductSetFill(mp[cp], product->fill));
7280       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7281       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7282       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7283       mp[cp]->product->api_user = product->api_user;
7284       PetscCall(MatProductSetFromOptions(mp[cp]));
7285       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7286       rmapt[cp] = 1;
7287       cmapt[cp] = 1;
7288       mptmp[cp] = PETSC_FALSE;
7289       cp++;
7290       PetscCall(MatProductCreate(a->A, p->B, NULL, &mp[cp]));
7291       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7292       PetscCall(MatProductSetFill(mp[cp], product->fill));
7293       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7294       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7295       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7296       mp[cp]->product->api_user = product->api_user;
7297       PetscCall(MatProductSetFromOptions(mp[cp]));
7298       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7299       rmapt[cp] = 1;
7300       cmapt[cp] = 2;
7301       cmapa[cp] = p->garray;
7302       mptmp[cp] = PETSC_FALSE;
7303       cp++;
7304     }
7305 
7306     /* A_off * P_other */
7307     if (mmdata->P_oth) {
7308       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g)); /* make P_oth use local col ids */
7309       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7310       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7311       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7312       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7313       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7314       PetscCall(MatProductSetFill(mp[cp], product->fill));
7315       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7316       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7317       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7318       mp[cp]->product->api_user = product->api_user;
7319       PetscCall(MatProductSetFromOptions(mp[cp]));
7320       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7321       rmapt[cp] = 1;
7322       cmapt[cp] = 2;
7323       cmapa[cp] = P_oth_idx;
7324       mptmp[cp] = PETSC_FALSE;
7325       cp++;
7326     }
7327     break;
7328 
7329   case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
7330     /* A is product->B */
7331     PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7332     if (A == P) { /* when A==P, we can take advantage of the already merged mmdata->Bloc */
7333       PetscCall(MatProductCreate(mmdata->Bloc, mmdata->Bloc, NULL, &mp[cp]));
7334       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7335       PetscCall(MatProductSetFill(mp[cp], product->fill));
7336       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7337       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7338       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7339       mp[cp]->product->api_user = product->api_user;
7340       PetscCall(MatProductSetFromOptions(mp[cp]));
7341       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7342       PetscCall(ISGetIndices(glob, &globidx));
7343       rmapt[cp] = 2;
7344       rmapa[cp] = globidx;
7345       cmapt[cp] = 2;
7346       cmapa[cp] = globidx;
7347       mptmp[cp] = PETSC_FALSE;
7348       cp++;
7349     } else {
7350       PetscCall(MatProductCreate(p->A, mmdata->Bloc, NULL, &mp[cp]));
7351       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7352       PetscCall(MatProductSetFill(mp[cp], product->fill));
7353       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7354       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7355       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7356       mp[cp]->product->api_user = product->api_user;
7357       PetscCall(MatProductSetFromOptions(mp[cp]));
7358       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7359       PetscCall(ISGetIndices(glob, &globidx));
7360       rmapt[cp] = 1;
7361       cmapt[cp] = 2;
7362       cmapa[cp] = globidx;
7363       mptmp[cp] = PETSC_FALSE;
7364       cp++;
7365       PetscCall(MatProductCreate(p->B, mmdata->Bloc, NULL, &mp[cp]));
7366       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7367       PetscCall(MatProductSetFill(mp[cp], product->fill));
7368       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7369       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7370       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7371       mp[cp]->product->api_user = product->api_user;
7372       PetscCall(MatProductSetFromOptions(mp[cp]));
7373       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7374       rmapt[cp] = 2;
7375       rmapa[cp] = p->garray;
7376       cmapt[cp] = 2;
7377       cmapa[cp] = globidx;
7378       mptmp[cp] = PETSC_FALSE;
7379       cp++;
7380     }
7381     break;
7382   case MATPRODUCT_PtAP:
7383     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7384     /* P is product->B */
7385     PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7386     PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7387     PetscCall(MatProductSetType(mp[cp], MATPRODUCT_PtAP));
7388     PetscCall(MatProductSetFill(mp[cp], product->fill));
7389     PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7390     PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7391     PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7392     mp[cp]->product->api_user = product->api_user;
7393     PetscCall(MatProductSetFromOptions(mp[cp]));
7394     PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7395     PetscCall(ISGetIndices(glob, &globidx));
7396     rmapt[cp] = 2;
7397     rmapa[cp] = globidx;
7398     cmapt[cp] = 2;
7399     cmapa[cp] = globidx;
7400     mptmp[cp] = PETSC_FALSE;
7401     cp++;
7402     if (mmdata->P_oth) {
7403       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g));
7404       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7405       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7406       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7407       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7408       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7409       PetscCall(MatProductSetFill(mp[cp], product->fill));
7410       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7411       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7412       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7413       mp[cp]->product->api_user = product->api_user;
7414       PetscCall(MatProductSetFromOptions(mp[cp]));
7415       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7416       mptmp[cp] = PETSC_TRUE;
7417       cp++;
7418       PetscCall(MatProductCreate(mmdata->Bloc, mp[1], NULL, &mp[cp]));
7419       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7420       PetscCall(MatProductSetFill(mp[cp], product->fill));
7421       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7422       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7423       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7424       mp[cp]->product->api_user = product->api_user;
7425       PetscCall(MatProductSetFromOptions(mp[cp]));
7426       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7427       rmapt[cp] = 2;
7428       rmapa[cp] = globidx;
7429       cmapt[cp] = 2;
7430       cmapa[cp] = P_oth_idx;
7431       mptmp[cp] = PETSC_FALSE;
7432       cp++;
7433     }
7434     break;
7435   default:
7436     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7437   }
7438   /* sanity check */
7439   if (size > 1)
7440     for (i = 0; i < cp; i++) PetscCheck(rmapt[i] != 2 || hasoffproc, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected offproc map type for product %" PetscInt_FMT, i);
7441 
7442   PetscCall(PetscMalloc2(cp, &mmdata->mp, cp, &mmdata->mptmp));
7443   for (i = 0; i < cp; i++) {
7444     mmdata->mp[i]    = mp[i];
7445     mmdata->mptmp[i] = mptmp[i];
7446   }
7447   mmdata->cp             = cp;
7448   C->product->data       = mmdata;
7449   C->product->destroy    = MatDestroy_MatMatMPIAIJBACKEND;
7450   C->ops->productnumeric = MatProductNumeric_MPIAIJBACKEND;
7451 
7452   /* memory type */
7453   mmdata->mtype = PETSC_MEMTYPE_HOST;
7454   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iscuda, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, ""));
7455   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iship, MATSEQAIJHIPSPARSE, MATMPIAIJHIPSPARSE, ""));
7456   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iskokk, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, ""));
7457   if (iscuda) mmdata->mtype = PETSC_MEMTYPE_CUDA;
7458   else if (iship) mmdata->mtype = PETSC_MEMTYPE_HIP;
7459   else if (iskokk) mmdata->mtype = PETSC_MEMTYPE_KOKKOS;
7460 
7461   /* prepare coo coordinates for values insertion */
7462 
7463   /* count the total nonzeros of the intermediate seqaij Mats
7464     ncoo_d:    # of nonzeros of matrices that do not have offproc entries
7465     ncoo_o:    # of nonzeros (of matrices that might have offproc entries) that will be inserted on remote procs
7466     ncoo_oown: # of nonzeros (of matrices that might have offproc entries) that will be inserted locally
7467   */
7468   for (cp = 0, ncoo_d = 0, ncoo_o = 0, ncoo_oown = 0; cp < mmdata->cp; cp++) {
7469     Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mp[cp]->data;
7470     if (mptmp[cp]) continue;
7471     if (rmapt[cp] == 2 && hasoffproc) { /* the rows need to be scattered to all processes (might include self) */
7472       const PetscInt *rmap = rmapa[cp];
7473       const PetscInt  mr   = mp[cp]->rmap->n;
7474       const PetscInt  rs   = C->rmap->rstart;
7475       const PetscInt  re   = C->rmap->rend;
7476       const PetscInt *ii   = mm->i;
7477       for (i = 0; i < mr; i++) {
7478         const PetscInt gr = rmap[i];
7479         const PetscInt nz = ii[i + 1] - ii[i];
7480         if (gr < rs || gr >= re) ncoo_o += nz; /* this row is offproc */
7481         else ncoo_oown += nz;                  /* this row is local */
7482       }
7483     } else ncoo_d += mm->nz;
7484   }
7485 
7486   /*
7487     ncoo: total number of nonzeros (including those inserted by remote procs) belonging to this proc
7488 
7489     ncoo = ncoo_d + ncoo_oown + ncoo2, where ncoo2 is the number of nonzeros inserted to me by other procs.
7490 
7491     off[0] points to a big index array, which is shared by off[1], off[2], ... Similarly for own[0].
7492 
7493     off[p]: points to the segment for matrix mp[p], storing the locations of nonzeros that mp[p] will insert on other procs
7494     own[p]: points to the segment for matrix mp[p], storing the locations of nonzeros that mp[p] will insert locally
7495     so off[p+1]-off[p] is the number of nonzeros that mp[p] will send to other procs.
7496 
7497     coo_i/j/v[]: [ncoo] row/col/val of nonzeros belonging to this proc.
7498     Ex. coo_i[]: the beginning part (of size ncoo_d + ncoo_oown) stores the row indices of locally inserted nonzeros, and the remaining part stores the row indices of nonzeros I will receive.
7499   */
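  /* Illustrative example (hypothetical numbers, not taken from this code): suppose there are two
     non-temporary intermediate products. mp[0] has rmapt[0] == 1 and 5 nonzeros, all of which stay
     on this rank, so ncoo_d = 5. mp[1] has rmapt[1] == 2, with 3 nonzeros falling in offproc rows
     and 2 in locally owned rows, so ncoo_o = 3 and ncoo_oown = 2. If other ranks send us 4 entries,
     then ncoo2 = 4 and ncoo = ncoo_d + ncoo_oown + ncoo2 = 11. The off[] segment for mp[1] has 3
     entries and the own[] segment 2 entries, each recording where the corresponding value sits in
     mp[1]'s local CSR arrays, while coo_i/j/v store the 7 locally produced entries first and the 4
     received entries at the end. */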
7500   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->off)); /* +1 to make a CSR-like data structure */
7501   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->own));
7502 
7503   /* gather (i,j) of nonzeros inserted by remote procs */
7504   if (hasoffproc) {
7505     PetscSF  msf;
7506     PetscInt ncoo2, *coo_i2, *coo_j2;
7507 
7508     PetscCall(PetscMalloc1(ncoo_o, &mmdata->off[0]));
7509     PetscCall(PetscMalloc1(ncoo_oown, &mmdata->own[0]));
7510     PetscCall(PetscMalloc2(ncoo_o, &coo_i, ncoo_o, &coo_j)); /* to collect (i,j) of entries to be sent to others */
7511 
7512     for (cp = 0, ncoo_o = 0; cp < mmdata->cp; cp++) {
7513       Mat_SeqAIJ *mm     = (Mat_SeqAIJ *)mp[cp]->data;
7514       PetscInt   *idxoff = mmdata->off[cp];
7515       PetscInt   *idxown = mmdata->own[cp];
7516       if (!mptmp[cp] && rmapt[cp] == 2) { /* row map is sparse */
7517         const PetscInt *rmap = rmapa[cp];
7518         const PetscInt *cmap = cmapa[cp];
7519         const PetscInt *ii   = mm->i;
7520         PetscInt       *coi  = coo_i + ncoo_o;
7521         PetscInt       *coj  = coo_j + ncoo_o;
7522         const PetscInt  mr   = mp[cp]->rmap->n;
7523         const PetscInt  rs   = C->rmap->rstart;
7524         const PetscInt  re   = C->rmap->rend;
7525         const PetscInt  cs   = C->cmap->rstart;
7526         for (i = 0; i < mr; i++) {
7527           const PetscInt *jj = mm->j + ii[i];
7528           const PetscInt  gr = rmap[i];
7529           const PetscInt  nz = ii[i + 1] - ii[i];
7530           if (gr < rs || gr >= re) { /* this is an offproc row */
7531             for (j = ii[i]; j < ii[i + 1]; j++) {
7532               *coi++    = gr;
7533               *idxoff++ = j;
7534             }
7535             if (!cmapt[cp]) { /* already global */
7536               for (j = 0; j < nz; j++) *coj++ = jj[j];
7537             } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7538               for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7539             } else { /* offdiag */
7540               for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7541             }
7542             ncoo_o += nz;
7543           } else { /* this is a local row */
7544             for (j = ii[i]; j < ii[i + 1]; j++) *idxown++ = j;
7545           }
7546         }
7547       }
7548       mmdata->off[cp + 1] = idxoff;
7549       mmdata->own[cp + 1] = idxown;
7550     }
7551 
7552     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7553     PetscInt incoo_o;
7554     PetscCall(PetscIntCast(ncoo_o, &incoo_o));
7555     PetscCall(PetscSFSetGraphLayout(mmdata->sf, C->rmap, incoo_o /*nleaves*/, NULL /*ilocal*/, PETSC_OWN_POINTER, coo_i));
7556     PetscCall(PetscSFGetMultiSF(mmdata->sf, &msf));
7557     PetscCall(PetscSFGetGraph(msf, &ncoo2 /*nroots*/, NULL, NULL, NULL));
7558     ncoo = ncoo_d + ncoo_oown + ncoo2;
7559     PetscCall(PetscMalloc2(ncoo, &coo_i2, ncoo, &coo_j2));
7560     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown)); /* put (i,j) of remote nonzeros at back */
7561     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown));
7562     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7563     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7564     PetscCall(PetscFree2(coo_i, coo_j));
7565     /* allocate MPI send buffer to collect nonzero values to be sent to remote procs */
7566     PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo_o * sizeof(PetscScalar), (void **)&mmdata->coo_w));
7567     coo_i = coo_i2;
7568     coo_j = coo_j2;
7569   } else { /* no offproc values insertion */
7570     ncoo = ncoo_d;
7571     PetscCall(PetscMalloc2(ncoo, &coo_i, ncoo, &coo_j));
7572 
7573     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7574     PetscCall(PetscSFSetGraph(mmdata->sf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
7575     PetscCall(PetscSFSetUp(mmdata->sf));
7576   }
7577   mmdata->hasoffproc = hasoffproc;
7578 
7579   /* gather (i,j) of nonzeros inserted locally */
7580   for (cp = 0, ncoo_d = 0; cp < mmdata->cp; cp++) {
7581     Mat_SeqAIJ     *mm   = (Mat_SeqAIJ *)mp[cp]->data;
7582     PetscInt       *coi  = coo_i + ncoo_d;
7583     PetscInt       *coj  = coo_j + ncoo_d;
7584     const PetscInt *jj   = mm->j;
7585     const PetscInt *ii   = mm->i;
7586     const PetscInt *cmap = cmapa[cp];
7587     const PetscInt *rmap = rmapa[cp];
7588     const PetscInt  mr   = mp[cp]->rmap->n;
7589     const PetscInt  rs   = C->rmap->rstart;
7590     const PetscInt  re   = C->rmap->rend;
7591     const PetscInt  cs   = C->cmap->rstart;
7592 
7593     if (mptmp[cp]) continue;
7594     if (rmapt[cp] == 1) { /* consecutive rows */
7595       /* fill coo_i */
7596       for (i = 0; i < mr; i++) {
7597         const PetscInt gr = i + rs;
7598         for (j = ii[i]; j < ii[i + 1]; j++) coi[j] = gr;
7599       }
7600       /* fill coo_j */
7601       if (!cmapt[cp]) { /* type-0, already global */
7602         PetscCall(PetscArraycpy(coj, jj, mm->nz));
7603       } else if (cmapt[cp] == 1) {                        /* type-1, local to global for consecutive columns of C */
7604         for (j = 0; j < mm->nz; j++) coj[j] = jj[j] + cs; /* lid + col start */
7605       } else {                                            /* type-2, local to global for sparse columns */
7606         for (j = 0; j < mm->nz; j++) coj[j] = cmap[jj[j]];
7607       }
7608       ncoo_d += mm->nz;
7609     } else if (rmapt[cp] == 2) { /* sparse rows */
7610       for (i = 0; i < mr; i++) {
7611         const PetscInt *jj = mm->j + ii[i];
7612         const PetscInt  gr = rmap[i];
7613         const PetscInt  nz = ii[i + 1] - ii[i];
7614         if (gr >= rs && gr < re) { /* local rows */
7615           for (j = ii[i]; j < ii[i + 1]; j++) *coi++ = gr;
7616           if (!cmapt[cp]) { /* type-0, already global */
7617             for (j = 0; j < nz; j++) *coj++ = jj[j];
7618           } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7619             for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7620           } else { /* type-2, local to global for sparse columns */
7621             for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7622           }
7623           ncoo_d += nz;
7624         }
7625       }
7626     }
7627   }
7628   if (glob) PetscCall(ISRestoreIndices(glob, &globidx));
7629   PetscCall(ISDestroy(&glob));
7630   if (P_oth_l2g) PetscCall(ISLocalToGlobalMappingRestoreIndices(P_oth_l2g, &P_oth_idx));
7631   PetscCall(ISLocalToGlobalMappingDestroy(&P_oth_l2g));
7632   /* allocate an array to store all nonzeros (inserted locally or remotely) belonging to this proc */
7633   PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo * sizeof(PetscScalar), (void **)&mmdata->coo_v));
7634 
7635   /* preallocate with COO data */
7636   PetscCall(MatSetPreallocationCOO(C, ncoo, coo_i, coo_j));
7637   PetscCall(PetscFree2(coo_i, coo_j));
7638   PetscFunctionReturn(PETSC_SUCCESS);
7639 }
7640 
7641 PetscErrorCode MatProductSetFromOptions_MPIAIJBACKEND(Mat mat)
7642 {
7643   Mat_Product *product = mat->product;
7644 #if defined(PETSC_HAVE_DEVICE)
7645   PetscBool match  = PETSC_FALSE;
7646   PetscBool usecpu = PETSC_FALSE;
7647 #else
7648   PetscBool match = PETSC_TRUE;
7649 #endif
7650 
7651   PetscFunctionBegin;
7652   MatCheckProduct(mat, 1);
7653 #if defined(PETSC_HAVE_DEVICE)
7654   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match));
7655   if (match) { /* we can always fall back to the CPU if requested */
7656     switch (product->type) {
7657     case MATPRODUCT_AB:
7658       if (product->api_user) {
7659         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
7660         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7661         PetscOptionsEnd();
7662       } else {
7663         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
7664         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7665         PetscOptionsEnd();
7666       }
7667       break;
7668     case MATPRODUCT_AtB:
7669       if (product->api_user) {
7670         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
7671         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7672         PetscOptionsEnd();
7673       } else {
7674         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
7675         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7676         PetscOptionsEnd();
7677       }
7678       break;
7679     case MATPRODUCT_PtAP:
7680       if (product->api_user) {
7681         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
7682         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7683         PetscOptionsEnd();
7684       } else {
7685         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
7686         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7687         PetscOptionsEnd();
7688       }
7689       break;
7690     default:
7691       break;
7692     }
7693     match = (PetscBool)!usecpu;
7694   }
7695 #endif
7696   if (match) {
7697     switch (product->type) {
7698     case MATPRODUCT_AB:
7699     case MATPRODUCT_AtB:
7700     case MATPRODUCT_PtAP:
7701       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJBACKEND;
7702       break;
7703     default:
7704       break;
7705     }
7706   }
7707   /* fall back to MPIAIJ ops */
7708   if (!mat->ops->productsymbolic) PetscCall(MatProductSetFromOptions_MPIAIJ(mat));
7709   PetscFunctionReturn(PETSC_SUCCESS);
7710 }
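
/* Example (illustrative; the option names are the ones handled above in
   MatProductSetFromOptions_MPIAIJBACKEND()): when both operands live on the GPU, a user can
   request the CPU fallback for a given product with -matmatmult_backend_cpu (MatMatMult),
   -mattransposematmult_backend_cpu (MatTransposeMatMult), or -matptap_backend_cpu (MatPtAP),
   or, when driving the product through the MatProduct API directly, with
   -mat_product_algorithm_backend_cpu. In that case the symbolic and numeric phases fall back
   to the MPIAIJ host implementation. */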
7711 
7712 /*
7713    Produces a set of block column indices of the matrix row, one for each block represented in the original row
7714 
7715    n - the number of block indices in cc[]
7716    cc - the block indices (must be large enough to contain the indices)
7717 */
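/* Illustrative example (not from this code): with bs = 2 and a row whose column indices are
   {0, 1, 4, 5, 8}, the collapsed block indices returned in cc[] are {0, 2, 4} and *n is 3. */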
7718 static inline PetscErrorCode MatCollapseRow(Mat Amat, PetscInt row, PetscInt bs, PetscInt *n, PetscInt *cc)
7719 {
7720   PetscInt        cnt = -1, nidx, j;
7721   const PetscInt *idx;
7722 
7723   PetscFunctionBegin;
7724   PetscCall(MatGetRow(Amat, row, &nidx, &idx, NULL));
7725   if (nidx) {
7726     cnt     = 0;
7727     cc[cnt] = idx[0] / bs;
7728     for (j = 1; j < nidx; j++) {
7729       if (cc[cnt] < idx[j] / bs) cc[++cnt] = idx[j] / bs;
7730     }
7731   }
7732   PetscCall(MatRestoreRow(Amat, row, &nidx, &idx, NULL));
7733   *n = cnt + 1;
7734   PetscFunctionReturn(PETSC_SUCCESS);
7735 }
7736 
7737 /*
7738     Produces a set of block column indices of the matrix block row, one for each block represented in the original set of rows
7739 
7740     ncollapsed - the number of block indices
7741     collapsed - the block indices (must be large enough to contain the indices)
7742 */
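/* Illustrative example (not from this code): with bs = 2, if row `start` collapses to block
   columns {0, 2} and row `start + 1` collapses to {0, 3}, the merged result is
   *collapsed = {0, 2, 3} with *ncollapsed = 3. The arrays w0, w1, and w2 are caller-provided
   workspace, each large enough to hold the merged block indices. */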
7743 static inline PetscErrorCode MatCollapseRows(Mat Amat, PetscInt start, PetscInt bs, PetscInt *w0, PetscInt *w1, PetscInt *w2, PetscInt *ncollapsed, PetscInt **collapsed)
7744 {
7745   PetscInt i, nprev, *cprev = w0, ncur = 0, *ccur = w1, *merged = w2, *cprevtmp;
7746 
7747   PetscFunctionBegin;
7748   PetscCall(MatCollapseRow(Amat, start, bs, &nprev, cprev));
7749   for (i = start + 1; i < start + bs; i++) {
7750     PetscCall(MatCollapseRow(Amat, i, bs, &ncur, ccur));
7751     PetscCall(PetscMergeIntArray(nprev, cprev, ncur, ccur, &nprev, &merged));
7752     cprevtmp = cprev;
7753     cprev    = merged;
7754     merged   = cprevtmp;
7755   }
7756   *ncollapsed = nprev;
7757   if (collapsed) *collapsed = cprev;
7758   PetscFunctionReturn(PETSC_SUCCESS);
7759 }
7760 
7761 /*
7762  MatCreateGraph_Simple_AIJ - create simple scalar matrix (graph) from potentially blocked matrix
7763 
7764  Input Parameters:
7765  + Amat - matrix
7766  . symmetrize - make the result symmetric
7767  - scale - scale with diagonal
7768 
7769  Output Parameter:
7770  . a_Gmat - output scalar graph >= 0
7771 
7772 */
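/* Illustrative usage sketch (not from this code): build a symmetrized, diagonally scaled graph
   from a (possibly blocked) AIJ matrix A, with no filtering and using all rows/columns of each
   block:

     Mat G;
     PetscCall(MatCreateGraph_Simple_AIJ(A, PETSC_TRUE, PETSC_TRUE, -1.0, 0, NULL, &G));
     PetscCall(MatDestroy(&G));

   A filter value >= 0 additionally calls MatFilter() on the result, and a nonempty index[]
   restricts which rows/columns of each block contribute to the block's scalar value. */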
7773 PETSC_INTERN PetscErrorCode MatCreateGraph_Simple_AIJ(Mat Amat, PetscBool symmetrize, PetscBool scale, PetscReal filter, PetscInt index_size, PetscInt index[], Mat *a_Gmat)
7774 {
7775   PetscInt  Istart, Iend, Ii, jj, kk, ncols, nloc, NN, MM, bs;
7776   MPI_Comm  comm;
7777   Mat       Gmat;
7778   PetscBool ismpiaij, isseqaij;
7779   Mat       a, b, c;
7780   MatType   jtype;
7781 
7782   PetscFunctionBegin;
7783   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
7784   PetscCall(MatGetOwnershipRange(Amat, &Istart, &Iend));
7785   PetscCall(MatGetSize(Amat, &MM, &NN));
7786   PetscCall(MatGetBlockSize(Amat, &bs));
7787   nloc = (Iend - Istart) / bs;
7788 
7789   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATSEQAIJ, &isseqaij));
7790   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATMPIAIJ, &ismpiaij));
7791   PetscCheck(isseqaij || ismpiaij, comm, PETSC_ERR_USER, "Require (MPI)AIJ matrix type");
7792 
7793   /* TODO GPU: these calls are potentially expensive if matrices are large and we want to use the GPU */
7794   /* One solution is to provide a new API, MatAIJGetCollapsedAIJ, for which each class can provide a fast
7795      implementation */
7796   if (bs > 1) {
7797     PetscCall(MatGetType(Amat, &jtype));
7798     PetscCall(MatCreate(comm, &Gmat));
7799     PetscCall(MatSetType(Gmat, jtype));
7800     PetscCall(MatSetSizes(Gmat, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE));
7801     PetscCall(MatSetBlockSizes(Gmat, 1, 1));
7802     if (isseqaij || ((Mat_MPIAIJ *)Amat->data)->garray) {
7803       PetscInt  *d_nnz, *o_nnz;
7804       MatScalar *aa, val, *AA;
7805       PetscInt  *aj, *ai, *AJ, nc, nmax = 0;
7806 
7807       if (isseqaij) {
7808         a = Amat;
7809         b = NULL;
7810       } else {
7811         Mat_MPIAIJ *d = (Mat_MPIAIJ *)Amat->data;
7812         a             = d->A;
7813         b             = d->B;
7814       }
7815       PetscCall(PetscInfo(Amat, "New bs>1 Graph. nloc=%" PetscInt_FMT "\n", nloc));
7816       PetscCall(PetscMalloc2(nloc, &d_nnz, isseqaij ? 0 : nloc, &o_nnz));
7817       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
7818         PetscInt       *nnz = (c == a) ? d_nnz : o_nnz;
7819         const PetscInt *cols1, *cols2;
7820 
7821         for (PetscInt brow = 0, nc1, nc2, ok = 1; brow < nloc * bs; brow += bs) { // block rows
7822           PetscCall(MatGetRow(c, brow, &nc2, &cols2, NULL));
7823           nnz[brow / bs] = nc2 / bs;
7824           if (nc2 % bs) ok = 0;
7825           if (nnz[brow / bs] > nmax) nmax = nnz[brow / bs];
7826           for (PetscInt ii = 1; ii < bs; ii++) { // check for non-dense blocks
7827             PetscCall(MatGetRow(c, brow + ii, &nc1, &cols1, NULL));
7828             if (nc1 != nc2) ok = 0;
7829             else {
7830               for (PetscInt jj = 0; jj < nc1 && ok == 1; jj++) {
7831                 if (cols1[jj] != cols2[jj]) ok = 0;
7832                 if (cols1[jj] % bs != jj % bs) ok = 0;
7833               }
7834             }
7835             PetscCall(MatRestoreRow(c, brow + ii, &nc1, &cols1, NULL));
7836           }
7837           PetscCall(MatRestoreRow(c, brow, &nc2, &cols2, NULL));
7838           if (!ok) {
7839             PetscCall(PetscFree2(d_nnz, o_nnz));
7840             PetscCall(PetscInfo(Amat, "Found sparse blocks - revert to slow method\n"));
7841             goto old_bs;
7842           }
7843         }
7844       }
7845       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7846       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7847       PetscCall(PetscFree2(d_nnz, o_nnz));
7848       PetscCall(PetscMalloc2(nmax, &AA, nmax, &AJ));
7849       // diag
7850       for (PetscInt brow = 0, n, grow; brow < nloc * bs; brow += bs) { // block rows
7851         Mat_SeqAIJ *aseq = (Mat_SeqAIJ *)a->data;
7852 
7853         ai = aseq->i;
7854         n  = ai[brow + 1] - ai[brow];
7855         aj = aseq->j + ai[brow];
7856         for (PetscInt k = 0; k < n; k += bs) {   // block columns
7857           AJ[k / bs] = aj[k] / bs + Istart / bs; // diag starts at (Istart,Istart)
7858           val        = 0;
7859           if (index_size == 0) {
7860             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7861               aa = aseq->a + ai[brow + ii] + k;
7862               for (PetscInt jj = 0; jj < bs; jj++) {    // columns in block
7863                 val += PetscAbs(PetscRealPart(aa[jj])); // a sort of norm
7864               }
7865             }
7866           } else {                                            // use (index,index) value if provided
7867             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7868               PetscInt ii = index[iii];
7869               aa          = aseq->a + ai[brow + ii] + k;
7870               for (PetscInt jjj = 0; jjj < index_size; jjj++) { // columns in block
7871                 PetscInt jj = index[jjj];
7872                 val += PetscAbs(PetscRealPart(aa[jj]));
7873               }
7874             }
7875           }
7876           PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7877           AA[k / bs] = val;
7878         }
7879         grow = Istart / bs + brow / bs;
7880         PetscCall(MatSetValues(Gmat, 1, &grow, n / bs, AJ, AA, ADD_VALUES));
7881       }
7882       // off-diag
7883       if (ismpiaij) {
7884         Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)Amat->data;
7885         const PetscScalar *vals;
7886         const PetscInt    *cols, *garray = aij->garray;
7887 
7888         PetscCheck(garray, PETSC_COMM_SELF, PETSC_ERR_USER, "No garray ?");
7889         for (PetscInt brow = 0, grow; brow < nloc * bs; brow += bs) { // block rows
7890           PetscCall(MatGetRow(b, brow, &ncols, &cols, NULL));
7891           for (PetscInt k = 0, cidx = 0; k < ncols; k += bs, cidx++) {
7892             PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs >= nmax");
7893             AA[k / bs] = 0;
7894             AJ[cidx]   = garray[cols[k]] / bs;
7895           }
7896           nc = ncols / bs;
7897           PetscCall(MatRestoreRow(b, brow, &ncols, &cols, NULL));
7898           if (index_size == 0) {
7899             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7900               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7901               for (PetscInt k = 0; k < ncols; k += bs) {
7902                 for (PetscInt jj = 0; jj < bs; jj++) { // cols in block
7903                   PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7904                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7905                 }
7906               }
7907               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7908             }
7909           } else {                                            // use (index,index) value if provided
7910             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7911               PetscInt ii = index[iii];
7912               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7913               for (PetscInt k = 0; k < ncols; k += bs) {
7914                 for (PetscInt jjj = 0; jjj < index_size; jjj++) { // cols in block
7915                   PetscInt jj = index[jjj];
7916                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7917                 }
7918               }
7919               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7920             }
7921           }
7922           grow = Istart / bs + brow / bs;
7923           PetscCall(MatSetValues(Gmat, 1, &grow, nc, AJ, AA, ADD_VALUES));
7924         }
7925       }
7926       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7927       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7928       PetscCall(PetscFree2(AA, AJ));
7929     } else {
7930       const PetscScalar *vals;
7931       const PetscInt    *idx;
7932       PetscInt          *d_nnz, *o_nnz, *w0, *w1, *w2;
7933     old_bs:
7934       /*
7935        Determine the preallocation needed for the scalar matrix derived from the vector matrix.
7936        */
7937       PetscCall(PetscInfo(Amat, "OLD bs>1 CreateGraph\n"));
7938       PetscCall(PetscMalloc2(nloc, &d_nnz, isseqaij ? 0 : nloc, &o_nnz));
7939       if (isseqaij) {
7940         PetscInt max_d_nnz;
7941 
7942         /*
7943          Determine exact preallocation count for (sequential) scalar matrix
7944          */
7945         PetscCall(MatSeqAIJGetMaxRowNonzeros(Amat, &max_d_nnz));
7946         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7947         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7948         for (Ii = 0, jj = 0; Ii < Iend; Ii += bs, jj++) PetscCall(MatCollapseRows(Amat, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7949         PetscCall(PetscFree3(w0, w1, w2));
7950       } else if (ismpiaij) {
7951         Mat             Daij, Oaij;
7952         const PetscInt *garray;
7953         PetscInt        max_d_nnz;
7954 
7955         PetscCall(MatMPIAIJGetSeqAIJ(Amat, &Daij, &Oaij, &garray));
7956         /*
7957          Determine exact preallocation count for diagonal block portion of scalar matrix
7958          */
7959         PetscCall(MatSeqAIJGetMaxRowNonzeros(Daij, &max_d_nnz));
7960         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7961         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7962         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) PetscCall(MatCollapseRows(Daij, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7963         PetscCall(PetscFree3(w0, w1, w2));
7964         /*
7965          Overestimate (usually grossly) the preallocation count for the off-diagonal portion of the scalar matrix
7966          */
7967         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) {
7968           o_nnz[jj] = 0;
7969           for (kk = 0; kk < bs; kk++) { /* rows that get collapsed to a single row */
7970             PetscCall(MatGetRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7971             o_nnz[jj] += ncols;
7972             PetscCall(MatRestoreRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7973           }
7974           if (o_nnz[jj] > (NN / bs - nloc)) o_nnz[jj] = NN / bs - nloc;
7975         }
7976       } else SETERRQ(comm, PETSC_ERR_USER, "Require AIJ matrix type");
7977       /* get scalar copy (norms) of matrix */
7978       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7979       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7980       PetscCall(PetscFree2(d_nnz, o_nnz));
7981       for (Ii = Istart; Ii < Iend; Ii++) {
7982         PetscInt dest_row = Ii / bs;
7983 
7984         PetscCall(MatGetRow(Amat, Ii, &ncols, &idx, &vals));
7985         for (jj = 0; jj < ncols; jj++) {
7986           PetscInt    dest_col = idx[jj] / bs;
7987           PetscScalar sv       = PetscAbs(PetscRealPart(vals[jj]));
7988 
7989           PetscCall(MatSetValues(Gmat, 1, &dest_row, 1, &dest_col, &sv, ADD_VALUES));
7990         }
7991         PetscCall(MatRestoreRow(Amat, Ii, &ncols, &idx, &vals));
7992       }
7993       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7994       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7995     }
7996   } else {
7997     if (symmetrize || filter >= 0 || scale) PetscCall(MatDuplicate(Amat, MAT_COPY_VALUES, &Gmat));
7998     else {
7999       Gmat = Amat;
8000       PetscCall(PetscObjectReference((PetscObject)Gmat));
8001     }
8002     if (isseqaij) {
8003       a = Gmat;
8004       b = NULL;
8005     } else {
8006       Mat_MPIAIJ *d = (Mat_MPIAIJ *)Gmat->data;
8007       a             = d->A;
8008       b             = d->B;
8009     }
8010     if (filter >= 0 || scale) {
8011       /* take absolute value of each entry */
8012       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
8013         MatInfo      info;
8014         PetscScalar *avals;
8015 
8016         PetscCall(MatGetInfo(c, MAT_LOCAL, &info));
8017         PetscCall(MatSeqAIJGetArray(c, &avals));
8018         for (int jj = 0; jj < info.nz_used; jj++) avals[jj] = PetscAbsScalar(avals[jj]);
8019         PetscCall(MatSeqAIJRestoreArray(c, &avals));
8020       }
8021     }
8022   }
8023   if (symmetrize) {
8024     PetscBool isset, issym;
8025 
8026     PetscCall(MatIsSymmetricKnown(Amat, &isset, &issym));
8027     if (!isset || !issym) {
8028       Mat matTrans;
8029 
8030       PetscCall(MatTranspose(Gmat, MAT_INITIAL_MATRIX, &matTrans));
8031       PetscCall(MatAXPY(Gmat, 1.0, matTrans, Gmat->structurally_symmetric == PETSC_BOOL3_TRUE ? SAME_NONZERO_PATTERN : DIFFERENT_NONZERO_PATTERN));
8032       PetscCall(MatDestroy(&matTrans));
8033     }
8034     PetscCall(MatSetOption(Gmat, MAT_SYMMETRIC, PETSC_TRUE));
8035   } else if (Amat != Gmat) PetscCall(MatPropagateSymmetryOptions(Amat, Gmat));
8036   if (scale) {
8037     /* scale Gmat so that all diagonal values equal 1 or -1 */
8038     Vec diag;
8039 
8040     PetscCall(MatCreateVecs(Gmat, &diag, NULL));
8041     PetscCall(MatGetDiagonal(Gmat, diag));
8042     PetscCall(VecReciprocal(diag));
8043     PetscCall(VecSqrtAbs(diag));
8044     PetscCall(MatDiagonalScale(Gmat, diag, diag));
8045     PetscCall(VecDestroy(&diag));
8046   }
8047   PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_graph_view"));
8048   if (filter >= 0) {
8049     PetscCall(MatFilter(Gmat, filter, PETSC_TRUE, PETSC_TRUE));
8050     PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_filter_graph_view"));
8051   }
8052   *a_Gmat = Gmat;
8053   PetscFunctionReturn(PETSC_SUCCESS);
8054 }
8055 
8056 /*
8057     Special version for direct calls from Fortran
8058 */
8059 
8060 /* Change these macros so they can be used in a void function */
8061 /* Identical to PetscCallVoid, except it assigns to *_ierr */
8062 #undef PetscCall
8063 #define PetscCall(...) \
8064   do { \
8065     PetscErrorCode ierr_msv_mpiaij = __VA_ARGS__; \
8066     if (PetscUnlikely(ierr_msv_mpiaij)) { \
8067       *_ierr = PetscError(PETSC_COMM_SELF, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr_msv_mpiaij, PETSC_ERROR_REPEAT, " "); \
8068       return; \
8069     } \
8070   } while (0)
8071 
8072 #undef SETERRQ
8073 #define SETERRQ(comm, ierr, ...) \
8074   do { \
8075     *_ierr = PetscError(comm, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr, PETSC_ERROR_INITIAL, __VA_ARGS__); \
8076     return; \
8077   } while (0)
8078 
8079 #if defined(PETSC_HAVE_FORTRAN_CAPS)
8080   #define matsetvaluesmpiaij_ MATSETVALUESMPIAIJ
8081 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
8082   #define matsetvaluesmpiaij_ matsetvaluesmpiaij
8083 #else
8084 #endif
8085 PETSC_EXTERN void matsetvaluesmpiaij_(Mat *mmat, PetscInt *mm, const PetscInt im[], PetscInt *mn, const PetscInt in[], const PetscScalar v[], InsertMode *maddv, PetscErrorCode *_ierr)
8086 {
8087   Mat         mat = *mmat;
8088   PetscInt    m = *mm, n = *mn;
8089   InsertMode  addv = *maddv;
8090   Mat_MPIAIJ *aij  = (Mat_MPIAIJ *)mat->data;
8091   PetscScalar value;
8092 
8093   MatCheckPreallocated(mat, 1);
8094   if (mat->insertmode == NOT_SET_VALUES) mat->insertmode = addv;
8095   else PetscCheck(mat->insertmode == addv, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Cannot mix add values and insert values");
8096   {
8097     PetscInt  i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
8098     PetscInt  cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
8099     PetscBool roworiented = aij->roworiented;
8100 
8101     /* Some Variables required in the macro */
8102     Mat         A     = aij->A;
8103     Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
8104     PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
8105     MatScalar  *aa;
8106     PetscBool   ignorezeroentries = ((a->ignorezeroentries && (addv == ADD_VALUES)) ? PETSC_TRUE : PETSC_FALSE);
8107     Mat         B                 = aij->B;
8108     Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
8109     PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
8110     MatScalar  *ba;
8111     /* This variable below is only for the PETSC_HAVE_VIENNACL or PETSC_HAVE_CUDA cases, but we define it in all cases because we
8112      * cannot use "#if defined" inside a macro. */
8113     PETSC_UNUSED PetscBool inserted = PETSC_FALSE;
8114 
8115     PetscInt  *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
8116     PetscInt   nonew = a->nonew;
8117     MatScalar *ap1, *ap2;
8118 
8119     PetscFunctionBegin;
8120     PetscCall(MatSeqAIJGetArray(A, &aa));
8121     PetscCall(MatSeqAIJGetArray(B, &ba));
8122     for (i = 0; i < m; i++) {
8123       if (im[i] < 0) continue;
8124       PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
8125       if (im[i] >= rstart && im[i] < rend) {
8126         row      = im[i] - rstart;
8127         lastcol1 = -1;
8128         rp1      = aj + ai[row];
8129         ap1      = aa + ai[row];
8130         rmax1    = aimax[row];
8131         nrow1    = ailen[row];
8132         low1     = 0;
8133         high1    = nrow1;
8134         lastcol2 = -1;
8135         rp2      = bj + bi[row];
8136         ap2      = ba + bi[row];
8137         rmax2    = bimax[row];
8138         nrow2    = bilen[row];
8139         low2     = 0;
8140         high2    = nrow2;
8141 
8142         for (j = 0; j < n; j++) {
8143           if (roworiented) value = v[i * n + j];
8144           else value = v[i + j * m];
8145           if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
8146           if (in[j] >= cstart && in[j] < cend) {
8147             col = in[j] - cstart;
8148             MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
8149           } else if (in[j] < 0) continue;
8150           else if (PetscUnlikelyDebug(in[j] >= mat->cmap->N)) {
8151             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
8152           } else {
8153             if (mat->was_assembled) {
8154               if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
8155 #if defined(PETSC_USE_CTABLE)
8156               PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col));
8157               col--;
8158 #else
8159               col = aij->colmap[in[j]] - 1;
8160 #endif
8161               if (col < 0 && !((Mat_SeqAIJ *)aij->A->data)->nonew) {
8162                 PetscCall(MatDisAssemble_MPIAIJ(mat));
8163                 col = in[j];
8164                 /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
8165                 B        = aij->B;
8166                 b        = (Mat_SeqAIJ *)B->data;
8167                 bimax    = b->imax;
8168                 bi       = b->i;
8169                 bilen    = b->ilen;
8170                 bj       = b->j;
8171                 rp2      = bj + bi[row];
8172                 ap2      = ba + bi[row];
8173                 rmax2    = bimax[row];
8174                 nrow2    = bilen[row];
8175                 low2     = 0;
8176                 high2    = nrow2;
8177                 bm       = aij->B->rmap->n;
8178                 ba       = b->a;
8179                 inserted = PETSC_FALSE;
8180               }
8181             } else col = in[j];
8182             MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
8183           }
8184         }
8185       } else if (!aij->donotstash) {
8186         if (roworiented) {
8187           PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, v + i * n, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8188         } else {
8189           PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, v + i, m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8190         }
8191       }
8192     }
8193     PetscCall(MatSeqAIJRestoreArray(A, &aa));
8194     PetscCall(MatSeqAIJRestoreArray(B, &ba));
8195   }
8196   PetscFunctionReturnVoid();
8197 }
8198 
8199 /* Undefine these here since they were redefined above from their original definitions! No
8200  * other PETSc functions should be defined past this point, as it is impossible to recover the
8201  * original definitions */
8202 #undef PetscCall
8203 #undef SETERRQ
8204