xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 75dbe01d0102b3d1f476e163bbbbf612f241f502)
1 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
2 #include <petsc/private/vecimpl.h>
3 #include <petsc/private/sfimpl.h>
4 #include <petsc/private/isimpl.h>
5 #include <petscblaslapack.h>
6 #include <petscsf.h>
7 #include <petsc/private/hashmapi.h>
8 
9 /* defines MatSetValues_MPI_Hash(), MatAssemblyBegin_MPI_Hash(), and MatAssemblyEnd_MPI_Hash() */
10 #define TYPE AIJ
11 #define TYPE_AIJ
12 #include "../src/mat/impls/aij/mpi/mpihashmat.h"
13 #undef TYPE
14 #undef TYPE_AIJ
15 
16 static PetscErrorCode MatReset_MPIAIJ(Mat mat)
17 {
18   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
19 
20   PetscFunctionBegin;
21   PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
22   PetscCall(MatStashDestroy_Private(&mat->stash));
23   PetscCall(VecDestroy(&aij->diag));
24   PetscCall(MatDestroy(&aij->A));
25   PetscCall(MatDestroy(&aij->B));
26 #if defined(PETSC_USE_CTABLE)
27   PetscCall(PetscHMapIDestroy(&aij->colmap));
28 #else
29   PetscCall(PetscFree(aij->colmap));
30 #endif
31   PetscCall(PetscFree(aij->garray));
32   PetscCall(VecDestroy(&aij->lvec));
33   PetscCall(VecScatterDestroy(&aij->Mvctx));
34   PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
35   PetscCall(PetscFree(aij->ld));
36   PetscFunctionReturn(PETSC_SUCCESS);
37 }
38 
39 static PetscErrorCode MatResetHash_MPIAIJ(Mat mat)
40 {
41   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
42   /* Save the nonzero states of the component matrices because they are used to determine
43      the nonzero state of mat */
44   PetscObjectState Astate = aij->A->nonzerostate, Bstate = aij->B->nonzerostate;
45 
46   PetscFunctionBegin;
47   PetscCall(MatReset_MPIAIJ(mat));
48   PetscCall(MatSetUp_MPI_Hash(mat));
49   aij->A->nonzerostate = ++Astate, aij->B->nonzerostate = ++Bstate;
50   PetscFunctionReturn(PETSC_SUCCESS);
51 }
52 
53 PetscErrorCode MatDestroy_MPIAIJ(Mat mat)
54 {
55   PetscFunctionBegin;
56   PetscCall(MatReset_MPIAIJ(mat));
57 
58   PetscCall(PetscFree(mat->data));
59 
60   /* may be created by MatCreateMPIAIJSumSeqAIJSymbolic */
61   PetscCall(PetscObjectCompose((PetscObject)mat, "MatMergeSeqsToMPI", NULL));
62 
63   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));
64   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatStoreValues_C", NULL));
65   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatRetrieveValues_C", NULL));
66   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatIsTranspose_C", NULL));
67   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocation_C", NULL));
68   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetPreallocation_C", NULL));
69   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetHash_C", NULL));
70   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocationCSR_C", NULL));
71   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDiagonalScaleLocal_C", NULL));
72   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpibaij_C", NULL));
73   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisbaij_C", NULL));
74 #if defined(PETSC_HAVE_CUDA)
75   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcusparse_C", NULL));
76 #endif
77 #if defined(PETSC_HAVE_HIP)
78   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijhipsparse_C", NULL));
79 #endif
80 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
81   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijkokkos_C", NULL));
82 #endif
83   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", NULL));
84 #if defined(PETSC_HAVE_ELEMENTAL)
85   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_elemental_C", NULL));
86 #endif
87 #if defined(PETSC_HAVE_SCALAPACK) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
88   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_scalapack_C", NULL));
89 #endif
90 #if defined(PETSC_HAVE_HYPRE)
91   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_hypre_C", NULL));
92   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", NULL));
93 #endif
94   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
95   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_is_mpiaij_C", NULL));
96   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpiaij_C", NULL));
97   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetUseScalableIncreaseOverlap_C", NULL));
98   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijperm_C", NULL));
99   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijsell_C", NULL));
100 #if defined(PETSC_HAVE_MKL_SPARSE)
101   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijmkl_C", NULL));
102 #endif
103   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcrl_C", NULL));
104   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
105   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisell_C", NULL));
106   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetPreallocationCOO_C", NULL));
107   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetValuesCOO_C", NULL));
108   PetscFunctionReturn(PETSC_SUCCESS);
109 }
110 
111 static PetscErrorCode MatGetRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
112 {
113   Mat B;
114 
115   PetscFunctionBegin;
116   PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &B));
117   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject)B));
118   PetscCall(MatGetRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
119   PetscCall(MatDestroy(&B));
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 static PetscErrorCode MatRestoreRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
124 {
125   Mat B;
126 
127   PetscFunctionBegin;
128   PetscCall(PetscObjectQuery((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject *)&B));
129   PetscCall(MatRestoreRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
130   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", NULL));
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 /*MC
135    MATAIJ - MATAIJ = "aij" - A matrix type to be used for sparse matrices.
136 
137    This matrix type is identical to `MATSEQAIJ` when constructed with a single process communicator,
138    and `MATMPIAIJ` otherwise.  As a result, for single process communicators,
139   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
140   for communicators controlling multiple processes.  It is recommended that you call both of
141   the above preallocation routines for simplicity.
142 
143    Options Database Key:
144 . -mat_type aij - sets the matrix type to `MATAIJ` during a call to `MatSetFromOptions()`
145 
146   Level: beginner
147 
148   Developer Note:
149   Subclasses include `MATAIJCUSPARSE`, `MATAIJPERM`, `MATAIJSELL`, `MATAIJMKL`, `MATAIJCRL`, and `MATAIJKOKKOS`. This type also automatically switches over to use inode
150   routines when enough inodes exist.
151 
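  Example Usage:
  A minimal sketch (`comm`, `M`, `N`, and the per-row counts 10 and 5 are placeholder values) that calls both preallocation routines so the same code is correct on one or many MPI processes is
.vb
  Mat A;
  PetscCall(MatCreate(comm, &A));
  PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, M, N));
  PetscCall(MatSetType(A, MATAIJ));
  PetscCall(MatSeqAIJSetPreallocation(A, 10, NULL));          /* used when the communicator has a single process */
  PetscCall(MatMPIAIJSetPreallocation(A, 10, NULL, 5, NULL)); /* used when it has more than one process */
.ve
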
152 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateAIJ()`, `MatCreateSeqAIJ()`
153 M*/
154 
155 /*MC
156    MATAIJCRL - MATAIJCRL = "aijcrl" - A matrix type to be used for sparse matrices.
157 
158    This matrix type is identical to `MATSEQAIJCRL` when constructed with a single process communicator,
159    and `MATMPIAIJCRL` otherwise.  As a result, for single process communicators,
160    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
161   for communicators controlling multiple processes.  It is recommended that you call both of
162   the above preallocation routines for simplicity.
163 
164    Options Database Key:
165 . -mat_type aijcrl - sets the matrix type to `MATAIJCRL` during a call to `MatSetFromOptions()`
166 
167   Level: beginner
168 
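  Example Usage:
  A minimal sketch (assuming `A` is an existing assembled `MATAIJ` matrix) of switching to this format in code is
.vb
  PetscCall(MatConvert(A, MATAIJCRL, MAT_INPLACE_MATRIX, &A));
.ve
  or, equivalently, pass -mat_type aijcrl on the command line when `MatSetFromOptions()` is used.
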
169 .seealso: [](ch_matrices), `Mat`, `MatCreateMPIAIJCRL()`, `MATSEQAIJCRL`, `MATMPIAIJCRL`
170 M*/
171 
172 static PetscErrorCode MatBindToCPU_MPIAIJ(Mat A, PetscBool flg)
173 {
174   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
175 
176   PetscFunctionBegin;
177 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_VIENNACL)
178   A->boundtocpu = flg;
179 #endif
180   if (a->A) PetscCall(MatBindToCPU(a->A, flg));
181   if (a->B) PetscCall(MatBindToCPU(a->B, flg));
182 
183   /* In addition to binding the diagonal and off-diagonal matrices, bind the local vectors used for matrix-vector products.
184    * This may seem a little odd for a MatBindToCPU() call to do, but it makes no sense for the binding of these vectors
185    * to differ from the parent matrix. */
186   if (a->lvec) PetscCall(VecBindToCPU(a->lvec, flg));
187   if (a->diag) PetscCall(VecBindToCPU(a->diag, flg));
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSetBlockSizes_MPIAIJ(Mat M, PetscInt rbs, PetscInt cbs)
192 {
193   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)M->data;
194 
195   PetscFunctionBegin;
196   if (mat->A) {
197     PetscCall(MatSetBlockSizes(mat->A, rbs, cbs));
198     PetscCall(MatSetBlockSizes(mat->B, rbs, 1));
199   }
200   PetscFunctionReturn(PETSC_SUCCESS);
201 }
202 
203 static PetscErrorCode MatFindNonzeroRows_MPIAIJ(Mat M, IS *keptrows)
204 {
205   Mat_MPIAIJ      *mat = (Mat_MPIAIJ *)M->data;
206   Mat_SeqAIJ      *a   = (Mat_SeqAIJ *)mat->A->data;
207   Mat_SeqAIJ      *b   = (Mat_SeqAIJ *)mat->B->data;
208   const PetscInt  *ia, *ib;
209   const MatScalar *aa, *bb, *aav, *bav;
210   PetscInt         na, nb, i, j, *rows, cnt = 0, n0rows;
211   PetscInt         m = M->rmap->n, rstart = M->rmap->rstart;
212 
213   PetscFunctionBegin;
214   *keptrows = NULL;
215 
216   ia = a->i;
217   ib = b->i;
218   PetscCall(MatSeqAIJGetArrayRead(mat->A, &aav));
219   PetscCall(MatSeqAIJGetArrayRead(mat->B, &bav));
220   for (i = 0; i < m; i++) {
221     na = ia[i + 1] - ia[i];
222     nb = ib[i + 1] - ib[i];
223     if (!na && !nb) {
224       cnt++;
225       goto ok1;
226     }
227     aa = aav + ia[i];
228     for (j = 0; j < na; j++) {
229       if (aa[j] != 0.0) goto ok1;
230     }
231     bb = PetscSafePointerPlusOffset(bav, ib[i]);
232     for (j = 0; j < nb; j++) {
233       if (bb[j] != 0.0) goto ok1;
234     }
235     cnt++;
236   ok1:;
237   }
238   PetscCallMPI(MPIU_Allreduce(&cnt, &n0rows, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)M)));
239   if (!n0rows) {
240     PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
241     PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
242     PetscFunctionReturn(PETSC_SUCCESS);
243   }
244   PetscCall(PetscMalloc1(M->rmap->n - cnt, &rows));
245   cnt = 0;
246   for (i = 0; i < m; i++) {
247     na = ia[i + 1] - ia[i];
248     nb = ib[i + 1] - ib[i];
249     if (!na && !nb) continue;
250     aa = aav + ia[i];
251     for (j = 0; j < na; j++) {
252       if (aa[j] != 0.0) {
253         rows[cnt++] = rstart + i;
254         goto ok2;
255       }
256     }
257     bb = PetscSafePointerPlusOffset(bav, ib[i]);
258     for (j = 0; j < nb; j++) {
259       if (bb[j] != 0.0) {
260         rows[cnt++] = rstart + i;
261         goto ok2;
262       }
263     }
264   ok2:;
265   }
266   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), cnt, rows, PETSC_OWN_POINTER, keptrows));
267   PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
268   PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
269   PetscFunctionReturn(PETSC_SUCCESS);
270 }
271 
272 static PetscErrorCode MatDiagonalSet_MPIAIJ(Mat Y, Vec D, InsertMode is)
273 {
274   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)Y->data;
275   PetscBool   cong;
276 
277   PetscFunctionBegin;
278   PetscCall(MatHasCongruentLayouts(Y, &cong));
279   if (Y->assembled && cong) {
280     PetscCall(MatDiagonalSet(aij->A, D, is));
281   } else {
282     PetscCall(MatDiagonalSet_Default(Y, D, is));
283   }
284   PetscFunctionReturn(PETSC_SUCCESS);
285 }
286 
287 static PetscErrorCode MatFindZeroDiagonals_MPIAIJ(Mat M, IS *zrows)
288 {
289   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)M->data;
290   PetscInt    i, rstart, nrows, *rows;
291 
292   PetscFunctionBegin;
293   *zrows = NULL;
294   PetscCall(MatFindZeroDiagonals_SeqAIJ_Private(aij->A, &nrows, &rows));
295   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
296   for (i = 0; i < nrows; i++) rows[i] += rstart;
297   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), nrows, rows, PETSC_OWN_POINTER, zrows));
298   PetscFunctionReturn(PETSC_SUCCESS);
299 }
300 
301 static PetscErrorCode MatGetColumnReductions_MPIAIJ(Mat A, PetscInt type, PetscReal *reductions)
302 {
303   Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)A->data;
304   PetscInt           i, m, n, *garray = aij->garray;
305   Mat_SeqAIJ        *a_aij = (Mat_SeqAIJ *)aij->A->data;
306   Mat_SeqAIJ        *b_aij = (Mat_SeqAIJ *)aij->B->data;
307   PetscReal         *work;
308   const PetscScalar *dummy;
309 
310   PetscFunctionBegin;
311   PetscCall(MatGetSize(A, &m, &n));
312   PetscCall(PetscCalloc1(n, &work));
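  /* The get/restore pairs below do not use the returned pointer; they appear to be here so that the
     host-side arrays a_aij->a and b_aij->a, read directly in the loops that follow, are up to date */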
313   PetscCall(MatSeqAIJGetArrayRead(aij->A, &dummy));
314   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &dummy));
315   PetscCall(MatSeqAIJGetArrayRead(aij->B, &dummy));
316   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &dummy));
317   if (type == NORM_2) {
318     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i] * a_aij->a[i]);
319     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i] * b_aij->a[i]);
320   } else if (type == NORM_1) {
321     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i]);
322     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i]);
323   } else if (type == NORM_INFINITY) {
324     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] = PetscMax(PetscAbsScalar(a_aij->a[i]), work[A->cmap->rstart + a_aij->j[i]]);
325     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] = PetscMax(PetscAbsScalar(b_aij->a[i]), work[garray[b_aij->j[i]]]);
326   } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
327     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscRealPart(a_aij->a[i]);
328     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscRealPart(b_aij->a[i]);
329   } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
330     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscImaginaryPart(a_aij->a[i]);
331     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscImaginaryPart(b_aij->a[i]);
332   } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
333   if (type == NORM_INFINITY) {
334     PetscCallMPI(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)A)));
335   } else {
336     PetscCallMPI(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
337   }
338   PetscCall(PetscFree(work));
339   if (type == NORM_2) {
340     for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
341   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
342     for (i = 0; i < n; i++) reductions[i] /= m;
343   }
344   PetscFunctionReturn(PETSC_SUCCESS);
345 }
346 
347 static PetscErrorCode MatFindOffBlockDiagonalEntries_MPIAIJ(Mat A, IS *is)
348 {
349   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
350   IS              sis, gis;
351   const PetscInt *isis, *igis;
352   PetscInt        n, *iis, nsis, ngis, rstart, i;
353 
354   PetscFunctionBegin;
355   PetscCall(MatFindOffBlockDiagonalEntries(a->A, &sis));
356   PetscCall(MatFindNonzeroRows(a->B, &gis));
357   PetscCall(ISGetSize(gis, &ngis));
358   PetscCall(ISGetSize(sis, &nsis));
359   PetscCall(ISGetIndices(sis, &isis));
360   PetscCall(ISGetIndices(gis, &igis));
361 
362   PetscCall(PetscMalloc1(ngis + nsis, &iis));
363   PetscCall(PetscArraycpy(iis, igis, ngis));
364   PetscCall(PetscArraycpy(iis + ngis, isis, nsis));
365   n = ngis + nsis;
366   PetscCall(PetscSortRemoveDupsInt(&n, iis));
367   PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
368   for (i = 0; i < n; i++) iis[i] += rstart;
369   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)A), n, iis, PETSC_OWN_POINTER, is));
370 
371   PetscCall(ISRestoreIndices(sis, &isis));
372   PetscCall(ISRestoreIndices(gis, &igis));
373   PetscCall(ISDestroy(&sis));
374   PetscCall(ISDestroy(&gis));
375   PetscFunctionReturn(PETSC_SUCCESS);
376 }
377 
378 /*
379   Local utility routine that creates a mapping from the global column
380 number to the local number in the off-diagonal part of the local
381 storage of the matrix.  When PETSC_USE_CTABLE is used this is scalable at
382 a slightly higher hash table cost; without it, it is not scalable (each process
383 has an order-N integer array) but is fast to access.
384 */
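/*
  A lookup in this map then reads (a sketch mirroring the code in MatSetValues_MPIAIJ() and MatGetValues_MPIAIJ();
  gcol is a global column index and lcol the resulting local index, both names illustrative; the stored values
  are shifted by one so that 0 can mean "not present"):

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscHMapIGetWithDefault(aij->colmap, gcol + 1, 0, &lcol));
    lcol--;
#else
    lcol = aij->colmap[gcol] - 1;
#endif

  so lcol < 0 indicates that the global column gcol has no local counterpart in the off-diagonal block B.
*/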
385 PetscErrorCode MatCreateColmap_MPIAIJ_Private(Mat mat)
386 {
387   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
388   PetscInt    n   = aij->B->cmap->n, i;
389 
390   PetscFunctionBegin;
391   PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
392 #if defined(PETSC_USE_CTABLE)
393   PetscCall(PetscHMapICreateWithSize(n, &aij->colmap));
394   for (i = 0; i < n; i++) PetscCall(PetscHMapISet(aij->colmap, aij->garray[i] + 1, i + 1));
395 #else
396   PetscCall(PetscCalloc1(mat->cmap->N + 1, &aij->colmap));
397   for (i = 0; i < n; i++) aij->colmap[aij->garray[i]] = i + 1;
398 #endif
399   PetscFunctionReturn(PETSC_SUCCESS);
400 }
401 
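/*
  The two macros below insert or add a single value at local (row, col) of the diagonal block A
  (first macro) or the off-diagonal block B (second macro): a short binary search followed by a linear
  scan locates the column; an existing entry is updated in place, otherwise a new entry is inserted,
  reallocating the row storage if needed (orow and ocol are used only in error messages).
*/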
402 #define MatSetValues_SeqAIJ_A_Private(row, col, value, addv, orow, ocol) \
403   do { \
404     if (col <= lastcol1) low1 = 0; \
405     else high1 = nrow1; \
406     lastcol1 = col; \
407     while (high1 - low1 > 5) { \
408       t = (low1 + high1) / 2; \
409       if (rp1[t] > col) high1 = t; \
410       else low1 = t; \
411     } \
412     for (_i = low1; _i < high1; _i++) { \
413       if (rp1[_i] > col) break; \
414       if (rp1[_i] == col) { \
415         if (addv == ADD_VALUES) { \
416           ap1[_i] += value; \
417           /* It is unclear whether this PetscLogFlops() call slows down the code */ \
418           (void)PetscLogFlops(1.0); \
419         } else ap1[_i] = value; \
420         goto a_noinsert; \
421       } \
422     } \
423     if (value == 0.0 && ignorezeroentries && row != col) { \
424       low1  = 0; \
425       high1 = nrow1; \
426       goto a_noinsert; \
427     } \
428     if (nonew == 1) { \
429       low1  = 0; \
430       high1 = nrow1; \
431       goto a_noinsert; \
432     } \
433     PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
434     MatSeqXAIJReallocateAIJ(A, am, 1, nrow1, row, col, rmax1, aa, ai, aj, rp1, ap1, aimax, nonew, MatScalar); \
435     N = nrow1++ - 1; \
436     a->nz++; \
437     high1++; \
438     /* shift up all the later entries in this row */ \
439     PetscCall(PetscArraymove(rp1 + _i + 1, rp1 + _i, N - _i + 1)); \
440     PetscCall(PetscArraymove(ap1 + _i + 1, ap1 + _i, N - _i + 1)); \
441     rp1[_i] = col; \
442     ap1[_i] = value; \
443   a_noinsert:; \
444     ailen[row] = nrow1; \
445   } while (0)
446 
447 #define MatSetValues_SeqAIJ_B_Private(row, col, value, addv, orow, ocol) \
448   do { \
449     if (col <= lastcol2) low2 = 0; \
450     else high2 = nrow2; \
451     lastcol2 = col; \
452     while (high2 - low2 > 5) { \
453       t = (low2 + high2) / 2; \
454       if (rp2[t] > col) high2 = t; \
455       else low2 = t; \
456     } \
457     for (_i = low2; _i < high2; _i++) { \
458       if (rp2[_i] > col) break; \
459       if (rp2[_i] == col) { \
460         if (addv == ADD_VALUES) { \
461           ap2[_i] += value; \
462           (void)PetscLogFlops(1.0); \
463         } else ap2[_i] = value; \
464         goto b_noinsert; \
465       } \
466     } \
467     if (value == 0.0 && ignorezeroentries) { \
468       low2  = 0; \
469       high2 = nrow2; \
470       goto b_noinsert; \
471     } \
472     if (nonew == 1) { \
473       low2  = 0; \
474       high2 = nrow2; \
475       goto b_noinsert; \
476     } \
477     PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
478     MatSeqXAIJReallocateAIJ(B, bm, 1, nrow2, row, col, rmax2, ba, bi, bj, rp2, ap2, bimax, nonew, MatScalar); \
479     N = nrow2++ - 1; \
480     b->nz++; \
481     high2++; \
482     /* shift up all the later entries in this row */ \
483     PetscCall(PetscArraymove(rp2 + _i + 1, rp2 + _i, N - _i + 1)); \
484     PetscCall(PetscArraymove(ap2 + _i + 1, ap2 + _i, N - _i + 1)); \
485     rp2[_i] = col; \
486     ap2[_i] = value; \
487   b_noinsert:; \
488     bilen[row] = nrow2; \
489   } while (0)
490 
491 static PetscErrorCode MatSetValuesRow_MPIAIJ(Mat A, PetscInt row, const PetscScalar v[])
492 {
493   Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)A->data;
494   Mat_SeqAIJ  *a = (Mat_SeqAIJ *)mat->A->data, *b = (Mat_SeqAIJ *)mat->B->data;
495   PetscInt     l, *garray                         = mat->garray, diag;
496   PetscScalar *aa, *ba;
497 
498   PetscFunctionBegin;
499   /* code only works for square matrices A */
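  /* v is expected to hold the values of the entire locally owned row in increasing global column order:
     first the off-diagonal (B) entries left of the diagonal block, then the diagonal (A) block entries,
     then the remaining off-diagonal entries to the right */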
500 
501   /* find size of row to the left of the diagonal part */
502   PetscCall(MatGetOwnershipRange(A, &diag, NULL));
503   row = row - diag;
504   for (l = 0; l < b->i[row + 1] - b->i[row]; l++) {
505     if (garray[b->j[b->i[row] + l]] > diag) break;
506   }
507   if (l) {
508     PetscCall(MatSeqAIJGetArray(mat->B, &ba));
509     PetscCall(PetscArraycpy(ba + b->i[row], v, l));
510     PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
511   }
512 
513   /* diagonal part */
514   if (a->i[row + 1] - a->i[row]) {
515     PetscCall(MatSeqAIJGetArray(mat->A, &aa));
516     PetscCall(PetscArraycpy(aa + a->i[row], v + l, a->i[row + 1] - a->i[row]));
517     PetscCall(MatSeqAIJRestoreArray(mat->A, &aa));
518   }
519 
520   /* right of diagonal part */
521   if (b->i[row + 1] - b->i[row] - l) {
522     PetscCall(MatSeqAIJGetArray(mat->B, &ba));
523     PetscCall(PetscArraycpy(ba + b->i[row] + l, v + l + a->i[row + 1] - a->i[row], b->i[row + 1] - b->i[row] - l));
524     PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
525   }
526   PetscFunctionReturn(PETSC_SUCCESS);
527 }
528 
529 PetscErrorCode MatSetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode addv)
530 {
531   Mat_MPIAIJ *aij   = (Mat_MPIAIJ *)mat->data;
532   PetscScalar value = 0.0;
533   PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
534   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
535   PetscBool   roworiented = aij->roworiented;
536 
537   /* Some Variables required in the macro */
538   Mat         A     = aij->A;
539   Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
540   PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
541   PetscBool   ignorezeroentries = a->ignorezeroentries;
542   Mat         B                 = aij->B;
543   Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
544   PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
545   MatScalar  *aa, *ba;
546   PetscInt   *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
547   PetscInt    nonew;
548   MatScalar  *ap1, *ap2;
549 
550   PetscFunctionBegin;
551   PetscCall(MatSeqAIJGetArray(A, &aa));
552   PetscCall(MatSeqAIJGetArray(B, &ba));
553   for (i = 0; i < m; i++) {
554     if (im[i] < 0) continue;
555     PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
556     if (im[i] >= rstart && im[i] < rend) {
557       row      = im[i] - rstart;
558       lastcol1 = -1;
559       rp1      = PetscSafePointerPlusOffset(aj, ai[row]);
560       ap1      = PetscSafePointerPlusOffset(aa, ai[row]);
561       rmax1    = aimax[row];
562       nrow1    = ailen[row];
563       low1     = 0;
564       high1    = nrow1;
565       lastcol2 = -1;
566       rp2      = PetscSafePointerPlusOffset(bj, bi[row]);
567       ap2      = PetscSafePointerPlusOffset(ba, bi[row]);
568       rmax2    = bimax[row];
569       nrow2    = bilen[row];
570       low2     = 0;
571       high2    = nrow2;
572 
573       for (j = 0; j < n; j++) {
574         if (v) value = roworiented ? v[i * n + j] : v[i + j * m];
575         if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
576         if (in[j] >= cstart && in[j] < cend) {
577           col   = in[j] - cstart;
578           nonew = a->nonew;
579           MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
580         } else if (in[j] < 0) {
581           continue;
582         } else {
583           PetscCheck(in[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
584           if (mat->was_assembled) {
585             if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
586 #if defined(PETSC_USE_CTABLE)
587             PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col)); /* map global col ids to local ones */
588             col--;
589 #else
590             col = aij->colmap[in[j]] - 1;
591 #endif
592             if (col < 0 && !((Mat_SeqAIJ *)aij->B->data)->nonew) { /* col < 0 means in[j] is a new col for B */
593               PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));  /* Change aij->B from reduced/local format to expanded/global format */
594               col = in[j];
595               /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
596               B     = aij->B;
597               b     = (Mat_SeqAIJ *)B->data;
598               bimax = b->imax;
599               bi    = b->i;
600               bilen = b->ilen;
601               bj    = b->j;
602               ba    = b->a;
603               rp2   = PetscSafePointerPlusOffset(bj, bi[row]);
604               ap2   = PetscSafePointerPlusOffset(ba, bi[row]);
605               rmax2 = bimax[row];
606               nrow2 = bilen[row];
607               low2  = 0;
608               high2 = nrow2;
609               bm    = aij->B->rmap->n;
610               ba    = b->a;
611             } else if (col < 0 && !(ignorezeroentries && value == 0.0)) {
612               PetscCheck(1 == ((Mat_SeqAIJ *)aij->B->data)->nonew, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", im[i], in[j]);
613               PetscCall(PetscInfo(mat, "Skipping of insertion of new nonzero location in off-diagonal portion of matrix %g(%" PetscInt_FMT ",%" PetscInt_FMT ")\n", (double)PetscRealPart(value), im[i], in[j]));
614             }
615           } else col = in[j];
616           nonew = b->nonew;
617           MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
618         }
619       }
620     } else {
621       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Setting off process row %" PetscInt_FMT " even though MatSetOption(,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) was set", im[i]);
622       if (!aij->donotstash) {
623         mat->assembled = PETSC_FALSE;
624         if (roworiented) {
625           PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i * n), (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
626         } else {
627           PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i), m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
628         }
629       }
630     }
631   }
632   PetscCall(MatSeqAIJRestoreArray(A, &aa)); /* aa, ba might have been freed due to reallocation above, but we do not access them here */
633   PetscCall(MatSeqAIJRestoreArray(B, &ba));
634   PetscFunctionReturn(PETSC_SUCCESS);
635 }
636 
637 /*
638     This function sets the j and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
639     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
640     No off-processor parts of the matrix are allowed here and mat->was_assembled has to be PETSC_FALSE.
641 */
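/*
  For example (a small sketch): with two local rows, cstart = 4, and cend = 8, the sorted CSR input
    mat_i = {0, 3, 5}
    mat_j = {1, 4, 6, 5, 9}
  places columns 4, 6 (stored as 0, 2) and 5 (stored as 1) in the diagonal block and columns 1, 9
  (kept as global indices) in the off-diagonal block, giving ailen = {2, 1} and bilen = {1, 1}.
*/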
642 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[])
643 {
644   Mat_MPIAIJ *aij    = (Mat_MPIAIJ *)mat->data;
645   Mat         A      = aij->A; /* diagonal part of the matrix */
646   Mat         B      = aij->B; /* off-diagonal part of the matrix */
647   Mat_SeqAIJ *a      = (Mat_SeqAIJ *)A->data;
648   Mat_SeqAIJ *b      = (Mat_SeqAIJ *)B->data;
649   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, col;
650   PetscInt   *ailen = a->ilen, *aj = a->j;
651   PetscInt   *bilen = b->ilen, *bj = b->j;
652   PetscInt    am          = aij->A->rmap->n, j;
653   PetscInt    diag_so_far = 0, dnz;
654   PetscInt    offd_so_far = 0, onz;
655 
656   PetscFunctionBegin;
657   /* Iterate over all rows of the matrix */
658   for (j = 0; j < am; j++) {
659     dnz = onz = 0;
660     /*  Iterate over all non-zero columns of the current row */
661     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
662       /* If column is in the diagonal */
663       if (mat_j[col] >= cstart && mat_j[col] < cend) {
664         aj[diag_so_far++] = mat_j[col] - cstart;
665         dnz++;
666       } else { /* off-diagonal entries */
667         bj[offd_so_far++] = mat_j[col];
668         onz++;
669       }
670     }
671     ailen[j] = dnz;
672     bilen[j] = onz;
673   }
674   PetscFunctionReturn(PETSC_SUCCESS);
675 }
676 
677 /*
678     This function sets the local j, a and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
679     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
680     No off-processor parts of the matrix are allowed here; they are set at a later point by MatSetValues_MPIAIJ.
681     Also, mat->was_assembled has to be false, otherwise the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart;
682     would not be true and the more complex MatSetValues_MPIAIJ has to be used.
683 */
684 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[], const PetscScalar mat_a[])
685 {
686   Mat_MPIAIJ  *aij  = (Mat_MPIAIJ *)mat->data;
687   Mat          A    = aij->A; /* diagonal part of the matrix */
688   Mat          B    = aij->B; /* off-diagonal part of the matrix */
689   Mat_SeqAIJ  *aijd = (Mat_SeqAIJ *)aij->A->data, *aijo = (Mat_SeqAIJ *)aij->B->data;
690   Mat_SeqAIJ  *a      = (Mat_SeqAIJ *)A->data;
691   Mat_SeqAIJ  *b      = (Mat_SeqAIJ *)B->data;
692   PetscInt     cstart = mat->cmap->rstart, cend = mat->cmap->rend;
693   PetscInt    *ailen = a->ilen, *aj = a->j;
694   PetscInt    *bilen = b->ilen, *bj = b->j;
695   PetscInt     am          = aij->A->rmap->n, j;
696   PetscInt    *full_diag_i = aijd->i, *full_offd_i = aijo->i; /* These variables can also include non-local elements, which are set at a later point. */
697   PetscInt     col, dnz_row, onz_row, rowstart_diag, rowstart_offd;
698   PetscScalar *aa = a->a, *ba = b->a;
699 
700   PetscFunctionBegin;
701   /* Iterate over all rows of the matrix */
702   for (j = 0; j < am; j++) {
703     dnz_row = onz_row = 0;
704     rowstart_offd     = full_offd_i[j];
705     rowstart_diag     = full_diag_i[j];
706     /*  Iterate over all non-zero columns of the current row */
707     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
708       /* If column is in the diagonal */
709       if (mat_j[col] >= cstart && mat_j[col] < cend) {
710         aj[rowstart_diag + dnz_row] = mat_j[col] - cstart;
711         aa[rowstart_diag + dnz_row] = mat_a[col];
712         dnz_row++;
713       } else { /* off-diagonal entries */
714         bj[rowstart_offd + onz_row] = mat_j[col];
715         ba[rowstart_offd + onz_row] = mat_a[col];
716         onz_row++;
717       }
718     }
719     ailen[j] = dnz_row;
720     bilen[j] = onz_row;
721   }
722   PetscFunctionReturn(PETSC_SUCCESS);
723 }
724 
725 static PetscErrorCode MatGetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
726 {
727   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
728   PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
729   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
730 
731   PetscFunctionBegin;
732   for (i = 0; i < m; i++) {
733     if (idxm[i] < 0) continue; /* negative row */
734     PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, idxm[i], mat->rmap->N - 1);
735     PetscCheck(idxm[i] >= rstart && idxm[i] < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported, row requested %" PetscInt_FMT " range [%" PetscInt_FMT " %" PetscInt_FMT ")", idxm[i], rstart, rend);
736     row = idxm[i] - rstart;
737     for (j = 0; j < n; j++) {
738       if (idxn[j] < 0) continue; /* negative column */
739       PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, idxn[j], mat->cmap->N - 1);
740       if (idxn[j] >= cstart && idxn[j] < cend) {
741         col = idxn[j] - cstart;
742         PetscCall(MatGetValues(aij->A, 1, &row, 1, &col, v + i * n + j));
743       } else {
744         if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
745 #if defined(PETSC_USE_CTABLE)
746         PetscCall(PetscHMapIGetWithDefault(aij->colmap, idxn[j] + 1, 0, &col));
747         col--;
748 #else
749         col = aij->colmap[idxn[j]] - 1;
750 #endif
751         if ((col < 0) || (aij->garray[col] != idxn[j])) *(v + i * n + j) = 0.0;
752         else PetscCall(MatGetValues(aij->B, 1, &row, 1, &col, v + i * n + j));
753       }
754     }
755   }
756   PetscFunctionReturn(PETSC_SUCCESS);
757 }
758 
759 static PetscErrorCode MatAssemblyBegin_MPIAIJ(Mat mat, MatAssemblyType mode)
760 {
761   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
762   PetscInt    nstash, reallocs;
763 
764   PetscFunctionBegin;
765   if (aij->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);
766 
767   PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
768   PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
769   PetscCall(PetscInfo(aij->A, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
770   PetscFunctionReturn(PETSC_SUCCESS);
771 }
772 
773 PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat mat, MatAssemblyType mode)
774 {
775   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *)mat->data;
776   PetscMPIInt  n;
777   PetscInt     i, j, rstart, ncols, flg;
778   PetscInt    *row, *col;
779   PetscBool    all_assembled;
780   PetscScalar *val;
781 
782   /* do not use 'b = (Mat_SeqAIJ*)aij->B->data' as B can be reset in disassembly */
783 
784   PetscFunctionBegin;
785   if (!aij->donotstash && !mat->nooffprocentries) {
786     while (1) {
787       PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
788       if (!flg) break;
789 
790       for (i = 0; i < n;) {
791         /* Now identify the consecutive vals belonging to the same row */
792         for (j = i, rstart = row[j]; j < n; j++) {
793           if (row[j] != rstart) break;
794         }
795         if (j < n) ncols = j - i;
796         else ncols = n - i;
797         /* Now assemble all these values with a single function call */
798         PetscCall(MatSetValues_MPIAIJ(mat, 1, row + i, ncols, col + i, val + i, mat->insertmode));
799         i = j;
800       }
801     }
802     PetscCall(MatStashScatterEnd_Private(&mat->stash));
803   }
804 #if defined(PETSC_HAVE_DEVICE)
805   if (mat->offloadmask == PETSC_OFFLOAD_CPU) aij->A->offloadmask = PETSC_OFFLOAD_CPU;
806   /* We call MatBindToCPU() on aij->A and aij->B here, because if MatBindToCPU_MPIAIJ() is called before assembly, it cannot bind these. */
807   if (mat->boundtocpu) {
808     PetscCall(MatBindToCPU(aij->A, PETSC_TRUE));
809     PetscCall(MatBindToCPU(aij->B, PETSC_TRUE));
810   }
811 #endif
812   PetscCall(MatAssemblyBegin(aij->A, mode));
813   PetscCall(MatAssemblyEnd(aij->A, mode));
814 
815   /* determine if any process has disassembled; if so, we must
816      also disassemble ourselves, in order that we may reassemble. */
817   /*
818      if the nonzero structure of submatrix B cannot change, then we know that
819      no process disassembled and thus we can skip this step
820   */
821   if (!((Mat_SeqAIJ *)aij->B->data)->nonew) {
822     PetscCallMPI(MPIU_Allreduce(&mat->was_assembled, &all_assembled, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
823     if (mat->was_assembled && !all_assembled) { /* mat on this rank has reduced off-diag B with local col ids, but globally it does not */
824       PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
825     }
826   }
827   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(MatSetUpMultiply_MPIAIJ(mat));
828   PetscCall(MatSetOption(aij->B, MAT_USE_INODES, PETSC_FALSE));
829 #if defined(PETSC_HAVE_DEVICE)
830   if (mat->offloadmask == PETSC_OFFLOAD_CPU && aij->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) aij->B->offloadmask = PETSC_OFFLOAD_CPU;
831 #endif
832   PetscCall(MatAssemblyBegin(aij->B, mode));
833   PetscCall(MatAssemblyEnd(aij->B, mode));
834 
835   PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
836 
837   aij->rowvalues = NULL;
838 
839   PetscCall(VecDestroy(&aij->diag));
840 
841   /* if no new nonzero locations are allowed in matrix then only set the matrix state the first time through */
842   if ((!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) || !((Mat_SeqAIJ *)aij->A->data)->nonew) {
843     PetscObjectState state = aij->A->nonzerostate + aij->B->nonzerostate;
844     PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
845   }
846 #if defined(PETSC_HAVE_DEVICE)
847   mat->offloadmask = PETSC_OFFLOAD_BOTH;
848 #endif
849   PetscFunctionReturn(PETSC_SUCCESS);
850 }
851 
852 static PetscErrorCode MatZeroEntries_MPIAIJ(Mat A)
853 {
854   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
855 
856   PetscFunctionBegin;
857   PetscCall(MatZeroEntries(l->A));
858   PetscCall(MatZeroEntries(l->B));
859   PetscFunctionReturn(PETSC_SUCCESS);
860 }
861 
862 static PetscErrorCode MatZeroRows_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
863 {
864   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)A->data;
865   PetscInt   *lrows;
866   PetscInt    r, len;
867   PetscBool   cong;
868 
869   PetscFunctionBegin;
870   /* get locally owned rows */
871   PetscCall(MatZeroRowsMapLocal_Private(A, N, rows, &len, &lrows));
872   PetscCall(MatHasCongruentLayouts(A, &cong));
873   /* fix right-hand side if needed */
874   if (x && b) {
875     const PetscScalar *xx;
876     PetscScalar       *bb;
877 
878     PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
879     PetscCall(VecGetArrayRead(x, &xx));
880     PetscCall(VecGetArray(b, &bb));
881     for (r = 0; r < len; ++r) bb[lrows[r]] = diag * xx[lrows[r]];
882     PetscCall(VecRestoreArrayRead(x, &xx));
883     PetscCall(VecRestoreArray(b, &bb));
884   }
885 
886   if (diag != 0.0 && cong) {
887     PetscCall(MatZeroRows(mat->A, len, lrows, diag, NULL, NULL));
888     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
889   } else if (diag != 0.0) { /* non-square or non-congruent layouts -> if keepnonzeropattern is false, we allow for new insertion */
890     Mat_SeqAIJ *aijA = (Mat_SeqAIJ *)mat->A->data;
891     Mat_SeqAIJ *aijB = (Mat_SeqAIJ *)mat->B->data;
892     PetscInt    nnwA, nnwB;
893     PetscBool   nnzA, nnzB;
894 
895     nnwA = aijA->nonew;
896     nnwB = aijB->nonew;
897     nnzA = aijA->keepnonzeropattern;
898     nnzB = aijB->keepnonzeropattern;
899     if (!nnzA) {
900       PetscCall(PetscInfo(mat->A, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on diagonal block.\n"));
901       aijA->nonew = 0;
902     }
903     if (!nnzB) {
904       PetscCall(PetscInfo(mat->B, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on off-diagonal block.\n"));
905       aijB->nonew = 0;
906     }
907     /* Must zero here before the next loop */
908     PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
909     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
910     for (r = 0; r < len; ++r) {
911       const PetscInt row = lrows[r] + A->rmap->rstart;
912       if (row >= A->cmap->N) continue;
913       PetscCall(MatSetValues(A, 1, &row, 1, &row, &diag, INSERT_VALUES));
914     }
915     aijA->nonew = nnwA;
916     aijB->nonew = nnwB;
917   } else {
918     PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
919     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
920   }
921   PetscCall(PetscFree(lrows));
922   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
923   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
924 
925   /* only change matrix nonzero state if pattern was allowed to be changed */
926   if (!((Mat_SeqAIJ *)mat->A->data)->keepnonzeropattern || !((Mat_SeqAIJ *)mat->A->data)->nonew) {
927     PetscObjectState state = mat->A->nonzerostate + mat->B->nonzerostate;
928     PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
929   }
930   PetscFunctionReturn(PETSC_SUCCESS);
931 }
932 
933 static PetscErrorCode MatZeroRowsColumns_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
934 {
935   Mat_MPIAIJ        *l = (Mat_MPIAIJ *)A->data;
936   PetscInt           n = A->rmap->n;
937   PetscInt           i, j, r, m, len = 0;
938   PetscInt          *lrows, *owners = A->rmap->range;
939   PetscMPIInt        p = 0;
940   PetscSFNode       *rrows;
941   PetscSF            sf;
942   const PetscScalar *xx;
943   PetscScalar       *bb, *mask, *aij_a;
944   Vec                xmask, lmask;
945   Mat_SeqAIJ        *aij = (Mat_SeqAIJ *)l->B->data;
946   const PetscInt    *aj, *ii, *ridx;
947   PetscScalar       *aa;
948 
949   PetscFunctionBegin;
950   /* Create SF where leaves are input rows and roots are owned rows */
951   PetscCall(PetscMalloc1(n, &lrows));
952   for (r = 0; r < n; ++r) lrows[r] = -1;
953   PetscCall(PetscMalloc1(N, &rrows));
954   for (r = 0; r < N; ++r) {
955     const PetscInt idx = rows[r];
956     PetscCheck(idx >= 0 && A->rmap->N > idx, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range [0,%" PetscInt_FMT ")", idx, A->rmap->N);
957     if (idx < owners[p] || owners[p + 1] <= idx) { /* short-circuit the search if the last p owns this row too */
958       PetscCall(PetscLayoutFindOwner(A->rmap, idx, &p));
959     }
960     rrows[r].rank  = p;
961     rrows[r].index = rows[r] - owners[p];
962   }
963   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
964   PetscCall(PetscSFSetGraph(sf, n, N, NULL, PETSC_OWN_POINTER, rrows, PETSC_OWN_POINTER));
965   /* Collect flags for rows to be zeroed */
966   PetscCall(PetscSFReduceBegin(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
967   PetscCall(PetscSFReduceEnd(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
968   PetscCall(PetscSFDestroy(&sf));
969   /* Compress and put in row numbers */
970   for (r = 0; r < n; ++r)
971     if (lrows[r] >= 0) lrows[len++] = r;
972   /* zero diagonal part of matrix */
973   PetscCall(MatZeroRowsColumns(l->A, len, lrows, diag, x, b));
974   /* handle off-diagonal part of matrix */
975   PetscCall(MatCreateVecs(A, &xmask, NULL));
976   PetscCall(VecDuplicate(l->lvec, &lmask));
977   PetscCall(VecGetArray(xmask, &bb));
978   for (i = 0; i < len; i++) bb[lrows[i]] = 1;
979   PetscCall(VecRestoreArray(xmask, &bb));
980   PetscCall(VecScatterBegin(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
981   PetscCall(VecScatterEnd(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
982   PetscCall(VecDestroy(&xmask));
983   if (x && b) { /* this code is buggy when the row and column layouts don't match */
984     PetscBool cong;
985 
986     PetscCall(MatHasCongruentLayouts(A, &cong));
987     PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
988     PetscCall(VecScatterBegin(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
989     PetscCall(VecScatterEnd(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
990     PetscCall(VecGetArrayRead(l->lvec, &xx));
991     PetscCall(VecGetArray(b, &bb));
992   }
993   PetscCall(VecGetArray(lmask, &mask));
994   /* remove zeroed rows of off-diagonal matrix */
995   PetscCall(MatSeqAIJGetArray(l->B, &aij_a));
996   ii = aij->i;
997   for (i = 0; i < len; i++) PetscCall(PetscArrayzero(PetscSafePointerPlusOffset(aij_a, ii[lrows[i]]), ii[lrows[i] + 1] - ii[lrows[i]]));
998   /* loop over all elements of the off-process part of the matrix, zeroing removed columns */
999   if (aij->compressedrow.use) {
1000     m    = aij->compressedrow.nrows;
1001     ii   = aij->compressedrow.i;
1002     ridx = aij->compressedrow.rindex;
1003     for (i = 0; i < m; i++) {
1004       n  = ii[i + 1] - ii[i];
1005       aj = aij->j + ii[i];
1006       aa = aij_a + ii[i];
1007 
1008       for (j = 0; j < n; j++) {
1009         if (PetscAbsScalar(mask[*aj])) {
1010           if (b) bb[*ridx] -= *aa * xx[*aj];
1011           *aa = 0.0;
1012         }
1013         aa++;
1014         aj++;
1015       }
1016       ridx++;
1017     }
1018   } else { /* do not use compressed row format */
1019     m = l->B->rmap->n;
1020     for (i = 0; i < m; i++) {
1021       n  = ii[i + 1] - ii[i];
1022       aj = aij->j + ii[i];
1023       aa = aij_a + ii[i];
1024       for (j = 0; j < n; j++) {
1025         if (PetscAbsScalar(mask[*aj])) {
1026           if (b) bb[i] -= *aa * xx[*aj];
1027           *aa = 0.0;
1028         }
1029         aa++;
1030         aj++;
1031       }
1032     }
1033   }
1034   if (x && b) {
1035     PetscCall(VecRestoreArray(b, &bb));
1036     PetscCall(VecRestoreArrayRead(l->lvec, &xx));
1037   }
1038   PetscCall(MatSeqAIJRestoreArray(l->B, &aij_a));
1039   PetscCall(VecRestoreArray(lmask, &mask));
1040   PetscCall(VecDestroy(&lmask));
1041   PetscCall(PetscFree(lrows));
1042 
1043   /* only change matrix nonzero state if pattern was allowed to be changed */
1044   if (!((Mat_SeqAIJ *)l->A->data)->nonew) {
1045     PetscObjectState state = l->A->nonzerostate + l->B->nonzerostate;
1046     PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
1047   }
1048   PetscFunctionReturn(PETSC_SUCCESS);
1049 }
1050 
1051 static PetscErrorCode MatMult_MPIAIJ(Mat A, Vec xx, Vec yy)
1052 {
1053   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1054   PetscInt    nt;
1055   VecScatter  Mvctx = a->Mvctx;
1056 
1057   PetscFunctionBegin;
1058   PetscCall(VecGetLocalSize(xx, &nt));
1059   PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
1060   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1061   PetscUseTypeMethod(a->A, mult, xx, yy);
1062   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1063   PetscUseTypeMethod(a->B, multadd, a->lvec, yy, yy);
1064   PetscFunctionReturn(PETSC_SUCCESS);
1065 }
1066 
1067 static PetscErrorCode MatMultDiagonalBlock_MPIAIJ(Mat A, Vec bb, Vec xx)
1068 {
1069   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1070 
1071   PetscFunctionBegin;
1072   PetscCall(MatMultDiagonalBlock(a->A, bb, xx));
1073   PetscFunctionReturn(PETSC_SUCCESS);
1074 }
1075 
1076 static PetscErrorCode MatMultAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1077 {
1078   Mat_MPIAIJ *a     = (Mat_MPIAIJ *)A->data;
1079   VecScatter  Mvctx = a->Mvctx;
1080 
1081   PetscFunctionBegin;
1082   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1083   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
1084   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1085   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
1086   PetscFunctionReturn(PETSC_SUCCESS);
1087 }
1088 
1089 static PetscErrorCode MatMultTranspose_MPIAIJ(Mat A, Vec xx, Vec yy)
1090 {
1091   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1092 
1093   PetscFunctionBegin;
1094   /* do nondiagonal part */
1095   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1096   /* do local part */
1097   PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
1098   /* add partial results together */
1099   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1100   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1101   PetscFunctionReturn(PETSC_SUCCESS);
1102 }
1103 
1104 static PetscErrorCode MatIsTranspose_MPIAIJ(Mat Amat, Mat Bmat, PetscReal tol, PetscBool *f)
1105 {
1106   MPI_Comm    comm;
1107   Mat_MPIAIJ *Aij = (Mat_MPIAIJ *)Amat->data, *Bij = (Mat_MPIAIJ *)Bmat->data;
1108   Mat         Adia = Aij->A, Bdia = Bij->A, Aoff, Boff, *Aoffs, *Boffs;
1109   IS          Me, Notme;
1110   PetscInt    M, N, first, last, *notme, i;
1111   PetscBool   lf;
1112   PetscMPIInt size;
1113 
1114   PetscFunctionBegin;
1115   /* Easy test: symmetric diagonal block */
1116   PetscCall(MatIsTranspose(Adia, Bdia, tol, &lf));
1117   PetscCallMPI(MPIU_Allreduce(&lf, f, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)Amat)));
1118   if (!*f) PetscFunctionReturn(PETSC_SUCCESS);
1119   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
1120   PetscCallMPI(MPI_Comm_size(comm, &size));
1121   if (size == 1) PetscFunctionReturn(PETSC_SUCCESS);
1122 
1123   /* Hard test: off-diagonal block. This requires a call to MatCreateSubMatrices(). */
1124   PetscCall(MatGetSize(Amat, &M, &N));
1125   PetscCall(MatGetOwnershipRange(Amat, &first, &last));
1126   PetscCall(PetscMalloc1(N - last + first, &notme));
1127   for (i = 0; i < first; i++) notme[i] = i;
1128   for (i = last; i < M; i++) notme[i - last + first] = i;
1129   PetscCall(ISCreateGeneral(MPI_COMM_SELF, N - last + first, notme, PETSC_COPY_VALUES, &Notme));
1130   PetscCall(ISCreateStride(MPI_COMM_SELF, last - first, first, 1, &Me));
1131   PetscCall(MatCreateSubMatrices(Amat, 1, &Me, &Notme, MAT_INITIAL_MATRIX, &Aoffs));
1132   Aoff = Aoffs[0];
1133   PetscCall(MatCreateSubMatrices(Bmat, 1, &Notme, &Me, MAT_INITIAL_MATRIX, &Boffs));
1134   Boff = Boffs[0];
1135   PetscCall(MatIsTranspose(Aoff, Boff, tol, f));
1136   PetscCall(MatDestroyMatrices(1, &Aoffs));
1137   PetscCall(MatDestroyMatrices(1, &Boffs));
1138   PetscCall(ISDestroy(&Me));
1139   PetscCall(ISDestroy(&Notme));
1140   PetscCall(PetscFree(notme));
1141   PetscFunctionReturn(PETSC_SUCCESS);
1142 }
1143 
1144 static PetscErrorCode MatMultTransposeAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1145 {
1146   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1147 
1148   PetscFunctionBegin;
1149   /* do nondiagonal part */
1150   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1151   /* do local part */
1152   PetscCall((*a->A->ops->multtransposeadd)(a->A, xx, yy, zz));
1153   /* add partial results together */
1154   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1155   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1156   PetscFunctionReturn(PETSC_SUCCESS);
1157 }
1158 
1159 /*
1160   This only works correctly for square matrices where the subblock A->A is the
1161    diagonal block
1162 */
1163 static PetscErrorCode MatGetDiagonal_MPIAIJ(Mat A, Vec v)
1164 {
1165   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1166 
1167   PetscFunctionBegin;
1168   PetscCheck(A->rmap->N == A->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Supports only square matrix where A->A is diag block");
1169   PetscCheck(A->rmap->rstart == A->cmap->rstart && A->rmap->rend == A->cmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "row partition must equal col partition");
1170   PetscCall(MatGetDiagonal(a->A, v));
1171   PetscFunctionReturn(PETSC_SUCCESS);
1172 }
1173 
1174 static PetscErrorCode MatScale_MPIAIJ(Mat A, PetscScalar aa)
1175 {
1176   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1177 
1178   PetscFunctionBegin;
1179   PetscCall(MatScale(a->A, aa));
1180   PetscCall(MatScale(a->B, aa));
1181   PetscFunctionReturn(PETSC_SUCCESS);
1182 }
1183 
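/*
  Writes the matrix in the PETSc binary viewer format as assembled below: a four-entry header
  {MAT_FILE_CLASSID, M, N, total number of nonzeros}, followed by the per-row nonzero counts,
  then all column indices (global, increasing within each row), then the corresponding values.
*/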
1184 static PetscErrorCode MatView_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
1185 {
1186   Mat_MPIAIJ        *aij    = (Mat_MPIAIJ *)mat->data;
1187   Mat_SeqAIJ        *A      = (Mat_SeqAIJ *)aij->A->data;
1188   Mat_SeqAIJ        *B      = (Mat_SeqAIJ *)aij->B->data;
1189   const PetscInt    *garray = aij->garray;
1190   const PetscScalar *aa, *ba;
1191   PetscInt           header[4], M, N, m, rs, cs, cnt, i, ja, jb;
1192   PetscInt64         nz, hnz;
1193   PetscInt          *rowlens;
1194   PetscInt          *colidxs;
1195   PetscScalar       *matvals;
1196   PetscMPIInt        rank;
1197 
1198   PetscFunctionBegin;
1199   PetscCall(PetscViewerSetUp(viewer));
1200 
1201   M  = mat->rmap->N;
1202   N  = mat->cmap->N;
1203   m  = mat->rmap->n;
1204   rs = mat->rmap->rstart;
1205   cs = mat->cmap->rstart;
1206   nz = A->nz + B->nz;
1207 
1208   /* write matrix header */
1209   header[0] = MAT_FILE_CLASSID;
1210   header[1] = M;
1211   header[2] = N;
1212   PetscCallMPI(MPI_Reduce(&nz, &hnz, 1, MPIU_INT64, MPI_SUM, 0, PetscObjectComm((PetscObject)mat)));
1213   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1214   if (rank == 0) PetscCall(PetscIntCast(hnz, &header[3]));
1215   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));
1216 
1217   /* fill in and store row lengths  */
1218   PetscCall(PetscMalloc1(m, &rowlens));
1219   for (i = 0; i < m; i++) rowlens[i] = A->i[i + 1] - A->i[i] + B->i[i + 1] - B->i[i];
1220   PetscCall(PetscViewerBinaryWriteAll(viewer, rowlens, m, rs, M, PETSC_INT));
1221   PetscCall(PetscFree(rowlens));
1222 
1223   /* fill in and store column indices */
1224   PetscCall(PetscMalloc1(nz, &colidxs));
1225   for (cnt = 0, i = 0; i < m; i++) {
1226     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1227       if (garray[B->j[jb]] > cs) break;
1228       colidxs[cnt++] = garray[B->j[jb]];
1229     }
1230     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) colidxs[cnt++] = A->j[ja] + cs;
1231     for (; jb < B->i[i + 1]; jb++) colidxs[cnt++] = garray[B->j[jb]];
1232   }
1233   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1234   PetscCall(PetscViewerBinaryWriteAll(viewer, colidxs, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
1235   PetscCall(PetscFree(colidxs));
1236 
1237   /* fill in and store nonzero values */
1238   PetscCall(MatSeqAIJGetArrayRead(aij->A, &aa));
1239   PetscCall(MatSeqAIJGetArrayRead(aij->B, &ba));
1240   PetscCall(PetscMalloc1(nz, &matvals));
1241   for (cnt = 0, i = 0; i < m; i++) {
1242     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1243       if (garray[B->j[jb]] > cs) break;
1244       matvals[cnt++] = ba[jb];
1245     }
1246     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) matvals[cnt++] = aa[ja];
1247     for (; jb < B->i[i + 1]; jb++) matvals[cnt++] = ba[jb];
1248   }
1249   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &aa));
1250   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &ba));
1251   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1252   PetscCall(PetscViewerBinaryWriteAll(viewer, matvals, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
1253   PetscCall(PetscFree(matvals));
1254 
1255   /* write block size option to the viewer's .info file */
1256   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1257   PetscFunctionReturn(PETSC_SUCCESS);
1258 }
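/*
   A hedged usage sketch (not part of the library source): the binary format written above is
   the one MatLoad() expects, so a parallel AIJ matrix can be dumped and read back as follows.
   The file name "matrix.dat" is only illustrative.

     PetscViewer viewer;
     Mat         B;

     PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, "matrix.dat", FILE_MODE_WRITE, &viewer));
     PetscCall(MatView(A, viewer));   // reaches MatView_MPIAIJ_Binary() on more than one rank
     PetscCall(PetscViewerDestroy(&viewer));

     PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, "matrix.dat", FILE_MODE_READ, &viewer));
     PetscCall(MatCreate(PETSC_COMM_WORLD, &B));
     PetscCall(MatSetType(B, MATMPIAIJ));
     PetscCall(MatLoad(B, viewer));
     PetscCall(PetscViewerDestroy(&viewer));
*/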
1259 
1260 #include <petscdraw.h>
1261 static PetscErrorCode MatView_MPIAIJ_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
1262 {
1263   Mat_MPIAIJ       *aij  = (Mat_MPIAIJ *)mat->data;
1264   PetscMPIInt       rank = aij->rank, size = aij->size;
1265   PetscBool         isdraw, isascii, isbinary;
1266   PetscViewer       sviewer;
1267   PetscViewerFormat format;
1268 
1269   PetscFunctionBegin;
1270   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1271   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
1272   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1273   if (isascii) {
1274     PetscCall(PetscViewerGetFormat(viewer, &format));
1275     if (format == PETSC_VIEWER_LOAD_BALANCE) {
1276       PetscInt i, nmax = 0, nmin = PETSC_INT_MAX, navg = 0, *nz, nzlocal = ((Mat_SeqAIJ *)aij->A->data)->nz + ((Mat_SeqAIJ *)aij->B->data)->nz;
1277       PetscCall(PetscMalloc1(size, &nz));
1278       PetscCallMPI(MPI_Allgather(&nzlocal, 1, MPIU_INT, nz, 1, MPIU_INT, PetscObjectComm((PetscObject)mat)));
1279       for (i = 0; i < size; i++) {
1280         nmax = PetscMax(nmax, nz[i]);
1281         nmin = PetscMin(nmin, nz[i]);
1282         navg += nz[i];
1283       }
1284       PetscCall(PetscFree(nz));
1285       navg = navg / size;
1286       PetscCall(PetscViewerASCIIPrintf(viewer, "Load Balance - Nonzeros: Min %" PetscInt_FMT "  avg %" PetscInt_FMT "  max %" PetscInt_FMT "\n", nmin, navg, nmax));
1287       PetscFunctionReturn(PETSC_SUCCESS);
1288     }
1289     PetscCall(PetscViewerGetFormat(viewer, &format));
1290     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1291       MatInfo   info;
1292       PetscInt *inodes = NULL;
1293 
1294       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1295       PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
1296       PetscCall(MatInodeGetInodeSizes(aij->A, NULL, &inodes, NULL));
1297       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
1298       if (!inodes) {
1299         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, not using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
1300                                                      info.memory));
1301       } else {
1302         PetscCall(
1303           PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated, info.memory));
1304       }
1305       PetscCall(MatGetInfo(aij->A, MAT_LOCAL, &info));
1306       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] on-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1307       PetscCall(MatGetInfo(aij->B, MAT_LOCAL, &info));
1308       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] off-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1309       PetscCall(PetscViewerFlush(viewer));
1310       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
1311       PetscCall(PetscViewerASCIIPrintf(viewer, "Information on VecScatter used in matrix-vector product: \n"));
1312       PetscCall(VecScatterView(aij->Mvctx, viewer));
1313       PetscFunctionReturn(PETSC_SUCCESS);
1314     } else if (format == PETSC_VIEWER_ASCII_INFO) {
1315       PetscInt inodecount, inodelimit, *inodes;
1316       PetscCall(MatInodeGetInodeSizes(aij->A, &inodecount, &inodes, &inodelimit));
1317       if (inodes) {
1318         PetscCall(PetscViewerASCIIPrintf(viewer, "using I-node (on process 0) routines: found %" PetscInt_FMT " nodes, limit used is %" PetscInt_FMT "\n", inodecount, inodelimit));
1319       } else {
1320         PetscCall(PetscViewerASCIIPrintf(viewer, "not using I-node (on process 0) routines\n"));
1321       }
1322       PetscFunctionReturn(PETSC_SUCCESS);
1323     } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1324       PetscFunctionReturn(PETSC_SUCCESS);
1325     }
1326   } else if (isbinary) {
1327     if (size == 1) {
1328       PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1329       PetscCall(MatView(aij->A, viewer));
1330     } else {
1331       PetscCall(MatView_MPIAIJ_Binary(mat, viewer));
1332     }
1333     PetscFunctionReturn(PETSC_SUCCESS);
1334   } else if (isascii && size == 1) {
1335     PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1336     PetscCall(MatView(aij->A, viewer));
1337     PetscFunctionReturn(PETSC_SUCCESS);
1338   } else if (isdraw) {
1339     PetscDraw draw;
1340     PetscBool isnull;
1341     PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1342     PetscCall(PetscDrawIsNull(draw, &isnull));
1343     if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1344   }
1345 
1346   { /* assemble the entire matrix onto first processor */
1347     Mat A = NULL, Av;
1348     IS  isrow, iscol;
1349 
1350     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->rmap->N : 0, 0, 1, &isrow));
1351     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->cmap->N : 0, 0, 1, &iscol));
1352     PetscCall(MatCreateSubMatrix(mat, isrow, iscol, MAT_INITIAL_MATRIX, &A));
1353     PetscCall(MatMPIAIJGetSeqAIJ(A, &Av, NULL, NULL));
1354     /*  The commented code uses MatCreateSubMatrices instead */
1355     /*
1356     Mat *AA, A = NULL, Av;
1357     IS  isrow,iscol;
1358 
1359     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->rmap->N : 0,0,1,&isrow));
1360     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->cmap->N : 0,0,1,&iscol));
1361     PetscCall(MatCreateSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&AA));
1362     if (rank == 0) {
1363        PetscCall(PetscObjectReference((PetscObject)AA[0]));
1364        A    = AA[0];
1365        Av   = AA[0];
1366     }
1367     PetscCall(MatDestroySubMatrices(1,&AA));
1368 */
1369     PetscCall(ISDestroy(&iscol));
1370     PetscCall(ISDestroy(&isrow));
1371     /*
1372        Every process must participate in these calls since the graphics waits are
1373        synchronized across all processes that share the PetscDraw object
1374     */
1375     PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1376     if (rank == 0) {
1377       if (((PetscObject)mat)->name) PetscCall(PetscObjectSetName((PetscObject)Av, ((PetscObject)mat)->name));
1378       PetscCall(MatView_SeqAIJ(Av, sviewer));
1379     }
1380     PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1381     PetscCall(MatDestroy(&A));
1382   }
1383   PetscFunctionReturn(PETSC_SUCCESS);
1384 }
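/*
   A hedged usage sketch (not part of the library source): the PETSC_VIEWER_ASCII_INFO_DETAIL
   and PETSC_VIEWER_LOAD_BALANCE branches above are reached by pushing the corresponding
   format on an ASCII viewer (or, equivalently, with the -mat_view ::ascii_info_detail option).

     PetscCall(PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD, PETSC_VIEWER_ASCII_INFO_DETAIL));
     PetscCall(MatView(A, PETSC_VIEWER_STDOUT_WORLD));
     PetscCall(PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD));
*/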
1385 
1386 PetscErrorCode MatView_MPIAIJ(Mat mat, PetscViewer viewer)
1387 {
1388   PetscBool isascii, isdraw, issocket, isbinary;
1389 
1390   PetscFunctionBegin;
1391   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
1392   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1393   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1394   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
1395   if (isascii || isdraw || isbinary || issocket) PetscCall(MatView_MPIAIJ_ASCIIorDraworSocket(mat, viewer));
1396   PetscFunctionReturn(PETSC_SUCCESS);
1397 }
1398 
1399 static PetscErrorCode MatSOR_MPIAIJ(Mat matin, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1400 {
1401   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)matin->data;
1402   Vec         bb1 = NULL;
1403   PetscBool   hasop;
1404 
1405   PetscFunctionBegin;
1406   if (flag == SOR_APPLY_UPPER) {
1407     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1408     PetscFunctionReturn(PETSC_SUCCESS);
1409   }
1410 
1411   if (its > 1 || ~flag & SOR_ZERO_INITIAL_GUESS || flag & SOR_EISENSTAT) PetscCall(VecDuplicate(bb, &bb1));
1412 
1413   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP) {
1414     if (flag & SOR_ZERO_INITIAL_GUESS) {
1415       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1416       its--;
1417     }
1418 
1419     while (its--) {
1420       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1421       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1422 
1423       /* update rhs: bb1 = bb - B*x */
1424       PetscCall(VecScale(mat->lvec, -1.0));
1425       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1426 
1427       /* local sweep */
1428       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_SYMMETRIC_SWEEP, fshift, lits, 1, xx));
1429     }
1430   } else if (flag & SOR_LOCAL_FORWARD_SWEEP) {
1431     if (flag & SOR_ZERO_INITIAL_GUESS) {
1432       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1433       its--;
1434     }
1435     while (its--) {
1436       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1437       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1438 
1439       /* update rhs: bb1 = bb - B*x */
1440       PetscCall(VecScale(mat->lvec, -1.0));
1441       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1442 
1443       /* local sweep */
1444       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_FORWARD_SWEEP, fshift, lits, 1, xx));
1445     }
1446   } else if (flag & SOR_LOCAL_BACKWARD_SWEEP) {
1447     if (flag & SOR_ZERO_INITIAL_GUESS) {
1448       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1449       its--;
1450     }
1451     while (its--) {
1452       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1453       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1454 
1455       /* update rhs: bb1 = bb - B*x */
1456       PetscCall(VecScale(mat->lvec, -1.0));
1457       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1458 
1459       /* local sweep */
1460       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_BACKWARD_SWEEP, fshift, lits, 1, xx));
1461     }
1462   } else if (flag & SOR_EISENSTAT) {
1463     Vec xx1;
1464 
1465     PetscCall(VecDuplicate(bb, &xx1));
1466     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_BACKWARD_SWEEP), fshift, lits, 1, xx));
1467 
1468     PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1469     PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1470     if (!mat->diag) {
1471       PetscCall(MatCreateVecs(matin, &mat->diag, NULL));
1472       PetscCall(MatGetDiagonal(matin, mat->diag));
1473     }
1474     PetscCall(MatHasOperation(matin, MATOP_MULT_DIAGONAL_BLOCK, &hasop));
1475     if (hasop) {
1476       PetscCall(MatMultDiagonalBlock(matin, xx, bb1));
1477     } else {
1478       PetscCall(VecPointwiseMult(bb1, mat->diag, xx));
1479     }
1480     PetscCall(VecAYPX(bb1, (omega - 2.0) / omega, bb));
1481 
1482     PetscCall(MatMultAdd(mat->B, mat->lvec, bb1, bb1));
1483 
1484     /* local sweep */
1485     PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_FORWARD_SWEEP), fshift, lits, 1, xx1));
1486     PetscCall(VecAXPY(xx, 1.0, xx1));
1487     PetscCall(VecDestroy(&xx1));
1488   } else SETERRQ(PetscObjectComm((PetscObject)matin), PETSC_ERR_SUP, "Parallel SOR not supported");
1489 
1490   PetscCall(VecDestroy(&bb1));
1491 
1492   matin->factorerrortype = mat->A->factorerrortype;
1493   PetscFunctionReturn(PETSC_SUCCESS);
1494 }
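/*
   A hedged usage sketch (not part of the library source): MatSOR() is normally driven by
   PCSOR, but it can also be called directly. Assuming `A`, `b`, and `x` are an assembled
   MATMPIAIJ and conforming vectors, one processor-local symmetric sweep looks like

     PetscCall(MatSOR(A, b, 1.0, (MatSORType)(SOR_LOCAL_SYMMETRIC_SWEEP | SOR_ZERO_INITIAL_GUESS), 0.0, 1, 1, x));

   which lands in the SOR_LOCAL_SYMMETRIC_SWEEP branch above; fully coupled parallel SOR is
   rejected by this implementation.
*/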
1495 
1496 static PetscErrorCode MatPermute_MPIAIJ(Mat A, IS rowp, IS colp, Mat *B)
1497 {
1498   Mat             aA, aB, Aperm;
1499   const PetscInt *rwant, *cwant, *gcols, *ai, *bi, *aj, *bj;
1500   PetscScalar    *aa, *ba;
1501   PetscInt        i, j, m, n, ng, anz, bnz, *dnnz, *onnz, *tdnnz, *tonnz, *rdest, *cdest, *work, *gcdest;
1502   PetscSF         rowsf, sf;
1503   IS              parcolp = NULL;
1504   PetscBool       done;
1505 
1506   PetscFunctionBegin;
1507   PetscCall(MatGetLocalSize(A, &m, &n));
1508   PetscCall(ISGetIndices(rowp, &rwant));
1509   PetscCall(ISGetIndices(colp, &cwant));
1510   PetscCall(PetscMalloc3(PetscMax(m, n), &work, m, &rdest, n, &cdest));
1511 
1512   /* Invert row permutation to find out where my rows should go */
1513   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &rowsf));
1514   PetscCall(PetscSFSetGraphLayout(rowsf, A->rmap, A->rmap->n, NULL, PETSC_OWN_POINTER, rwant));
1515   PetscCall(PetscSFSetFromOptions(rowsf));
1516   for (i = 0; i < m; i++) work[i] = A->rmap->rstart + i;
1517   PetscCall(PetscSFReduceBegin(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1518   PetscCall(PetscSFReduceEnd(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1519 
1520   /* Invert column permutation to find out where my columns should go */
1521   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1522   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, A->cmap->n, NULL, PETSC_OWN_POINTER, cwant));
1523   PetscCall(PetscSFSetFromOptions(sf));
1524   for (i = 0; i < n; i++) work[i] = A->cmap->rstart + i;
1525   PetscCall(PetscSFReduceBegin(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1526   PetscCall(PetscSFReduceEnd(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1527   PetscCall(PetscSFDestroy(&sf));
1528 
1529   PetscCall(ISRestoreIndices(rowp, &rwant));
1530   PetscCall(ISRestoreIndices(colp, &cwant));
1531   PetscCall(MatMPIAIJGetSeqAIJ(A, &aA, &aB, &gcols));
1532 
1533   /* Find out where my gcols should go */
1534   PetscCall(MatGetSize(aB, NULL, &ng));
1535   PetscCall(PetscMalloc1(ng, &gcdest));
1536   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1537   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, ng, NULL, PETSC_OWN_POINTER, gcols));
1538   PetscCall(PetscSFSetFromOptions(sf));
1539   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1540   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1541   PetscCall(PetscSFDestroy(&sf));
1542 
1543   PetscCall(PetscCalloc4(m, &dnnz, m, &onnz, m, &tdnnz, m, &tonnz));
1544   PetscCall(MatGetRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1545   PetscCall(MatGetRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1546   for (i = 0; i < m; i++) {
1547     PetscInt    row = rdest[i];
1548     PetscMPIInt rowner;
1549     PetscCall(PetscLayoutFindOwner(A->rmap, row, &rowner));
1550     for (j = ai[i]; j < ai[i + 1]; j++) {
1551       PetscInt    col = cdest[aj[j]];
1552       PetscMPIInt cowner;
1553       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner)); /* Could build an index for the columns to eliminate this search */
1554       if (rowner == cowner) dnnz[i]++;
1555       else onnz[i]++;
1556     }
1557     for (j = bi[i]; j < bi[i + 1]; j++) {
1558       PetscInt    col = gcdest[bj[j]];
1559       PetscMPIInt cowner;
1560       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner));
1561       if (rowner == cowner) dnnz[i]++;
1562       else onnz[i]++;
1563     }
1564   }
1565   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1566   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1567   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1568   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1569   PetscCall(PetscSFDestroy(&rowsf));
1570 
1571   PetscCall(MatCreateAIJ(PetscObjectComm((PetscObject)A), A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N, 0, tdnnz, 0, tonnz, &Aperm));
1572   PetscCall(MatSeqAIJGetArray(aA, &aa));
1573   PetscCall(MatSeqAIJGetArray(aB, &ba));
1574   for (i = 0; i < m; i++) {
1575     PetscInt *acols = dnnz, *bcols = onnz; /* Repurpose now-unneeded arrays */
1576     PetscInt  j0, rowlen;
1577     rowlen = ai[i + 1] - ai[i];
1578     for (j0 = j = 0; j < rowlen; j0 = j) { /* rowlen could be larger than the length m of the repurposed work arrays, so set the values in batches of at most m */
1579       for (; j < PetscMin(rowlen, j0 + m); j++) acols[j - j0] = cdest[aj[ai[i] + j]];
1580       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, acols, aa + ai[i] + j0, INSERT_VALUES));
1581     }
1582     rowlen = bi[i + 1] - bi[i];
1583     for (j0 = j = 0; j < rowlen; j0 = j) {
1584       for (; j < PetscMin(rowlen, j0 + m); j++) bcols[j - j0] = gcdest[bj[bi[i] + j]];
1585       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, bcols, ba + bi[i] + j0, INSERT_VALUES));
1586     }
1587   }
1588   PetscCall(MatAssemblyBegin(Aperm, MAT_FINAL_ASSEMBLY));
1589   PetscCall(MatAssemblyEnd(Aperm, MAT_FINAL_ASSEMBLY));
1590   PetscCall(MatRestoreRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1591   PetscCall(MatRestoreRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1592   PetscCall(MatSeqAIJRestoreArray(aA, &aa));
1593   PetscCall(MatSeqAIJRestoreArray(aB, &ba));
1594   PetscCall(PetscFree4(dnnz, onnz, tdnnz, tonnz));
1595   PetscCall(PetscFree3(work, rdest, cdest));
1596   PetscCall(PetscFree(gcdest));
1597   if (parcolp) PetscCall(ISDestroy(&colp));
1598   *B = Aperm;
1599   PetscFunctionReturn(PETSC_SUCCESS);
1600 }
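/*
   A hedged usage sketch (not part of the library source): MatPermute() takes parallel index
   sets whose local pieces give the new global positions of this process's rows and columns.
   The identity permutation below is only illustrative; a square matrix with matching row and
   column layouts is assumed.

     IS       rowp, colp;
     Mat      Aperm;
     PetscInt rstart, rend;

     PetscCall(MatGetOwnershipRange(A, &rstart, &rend));
     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)A), rend - rstart, rstart, 1, &rowp));
     PetscCall(ISDuplicate(rowp, &colp));
     PetscCall(MatPermute(A, rowp, colp, &Aperm));
     PetscCall(ISDestroy(&rowp));
     PetscCall(ISDestroy(&colp));
*/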
1601 
1602 static PetscErrorCode MatGetGhosts_MPIAIJ(Mat mat, PetscInt *nghosts, const PetscInt *ghosts[])
1603 {
1604   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1605 
1606   PetscFunctionBegin;
1607   PetscCall(MatGetSize(aij->B, NULL, nghosts));
1608   if (ghosts) *ghosts = aij->garray;
1609   PetscFunctionReturn(PETSC_SUCCESS);
1610 }
1611 
1612 static PetscErrorCode MatGetInfo_MPIAIJ(Mat matin, MatInfoType flag, MatInfo *info)
1613 {
1614   Mat_MPIAIJ    *mat = (Mat_MPIAIJ *)matin->data;
1615   Mat            A = mat->A, B = mat->B;
1616   PetscLogDouble isend[5], irecv[5];
1617 
1618   PetscFunctionBegin;
1619   info->block_size = 1.0;
1620   PetscCall(MatGetInfo(A, MAT_LOCAL, info));
1621 
1622   isend[0] = info->nz_used;
1623   isend[1] = info->nz_allocated;
1624   isend[2] = info->nz_unneeded;
1625   isend[3] = info->memory;
1626   isend[4] = info->mallocs;
1627 
1628   PetscCall(MatGetInfo(B, MAT_LOCAL, info));
1629 
1630   isend[0] += info->nz_used;
1631   isend[1] += info->nz_allocated;
1632   isend[2] += info->nz_unneeded;
1633   isend[3] += info->memory;
1634   isend[4] += info->mallocs;
1635   if (flag == MAT_LOCAL) {
1636     info->nz_used      = isend[0];
1637     info->nz_allocated = isend[1];
1638     info->nz_unneeded  = isend[2];
1639     info->memory       = isend[3];
1640     info->mallocs      = isend[4];
1641   } else if (flag == MAT_GLOBAL_MAX) {
1642     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_MAX, PetscObjectComm((PetscObject)matin)));
1643 
1644     info->nz_used      = irecv[0];
1645     info->nz_allocated = irecv[1];
1646     info->nz_unneeded  = irecv[2];
1647     info->memory       = irecv[3];
1648     info->mallocs      = irecv[4];
1649   } else if (flag == MAT_GLOBAL_SUM) {
1650     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_SUM, PetscObjectComm((PetscObject)matin)));
1651 
1652     info->nz_used      = irecv[0];
1653     info->nz_allocated = irecv[1];
1654     info->nz_unneeded  = irecv[2];
1655     info->memory       = irecv[3];
1656     info->mallocs      = irecv[4];
1657   }
1658   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1659   info->fill_ratio_needed = 0;
1660   info->factor_mallocs    = 0;
1661   PetscFunctionReturn(PETSC_SUCCESS);
1662 }
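/*
   A hedged usage sketch (not part of the library source): the reductions above are selected
   through the MatInfoType argument of MatGetInfo(); for example, the total number of stored
   nonzeros across all ranks is obtained with MAT_GLOBAL_SUM.

     MatInfo info;

     PetscCall(MatGetInfo(A, MAT_GLOBAL_SUM, &info));
     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "nonzeros used %g, allocated %g\n", info.nz_used, info.nz_allocated));
*/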
1663 
1664 PetscErrorCode MatSetOption_MPIAIJ(Mat A, MatOption op, PetscBool flg)
1665 {
1666   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1667 
1668   PetscFunctionBegin;
1669   switch (op) {
1670   case MAT_NEW_NONZERO_LOCATIONS:
1671   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1672   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1673   case MAT_KEEP_NONZERO_PATTERN:
1674   case MAT_NEW_NONZERO_LOCATION_ERR:
1675   case MAT_USE_INODES:
1676   case MAT_IGNORE_ZERO_ENTRIES:
1677   case MAT_FORM_EXPLICIT_TRANSPOSE:
1678     MatCheckPreallocated(A, 1);
1679     PetscCall(MatSetOption(a->A, op, flg));
1680     PetscCall(MatSetOption(a->B, op, flg));
1681     break;
1682   case MAT_ROW_ORIENTED:
1683     MatCheckPreallocated(A, 1);
1684     a->roworiented = flg;
1685 
1686     PetscCall(MatSetOption(a->A, op, flg));
1687     PetscCall(MatSetOption(a->B, op, flg));
1688     break;
1689   case MAT_IGNORE_OFF_PROC_ENTRIES:
1690     a->donotstash = flg;
1691     break;
1692   /* Symmetry flags are handled directly by MatSetOption() and they don't affect preallocation */
1693   case MAT_SPD:
1694   case MAT_SYMMETRIC:
1695   case MAT_STRUCTURALLY_SYMMETRIC:
1696   case MAT_HERMITIAN:
1697   case MAT_SYMMETRY_ETERNAL:
1698   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
1699   case MAT_SPD_ETERNAL:
1700     /* if the diagonal matrix is square it inherits some of the properties above */
1701     if (a->A && A->rmap->n == A->cmap->n) PetscCall(MatSetOption(a->A, op, flg));
1702     break;
1703   case MAT_SUBMAT_SINGLEIS:
1704     A->submat_singleis = flg;
1705     break;
1706   default:
1707     break;
1708   }
1709   PetscFunctionReturn(PETSC_SUCCESS);
1710 }
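/*
   A hedged usage sketch (not part of the library source): the options handled above are
   simply forwarded from MatSetOption(). For instance, dropping off-process entries during
   assembly and erroring on insertions that would require new allocation can be requested with

     PetscCall(MatSetOption(A, MAT_IGNORE_OFF_PROC_ENTRIES, PETSC_TRUE));
     PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
*/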
1711 
1712 PetscErrorCode MatGetRow_MPIAIJ(Mat matin, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1713 {
1714   Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)matin->data;
1715   PetscScalar *vworkA, *vworkB, **pvA, **pvB, *v_p;
1716   PetscInt     i, *cworkA, *cworkB, **pcA, **pcB, cstart = matin->cmap->rstart;
1717   PetscInt     nztot, nzA, nzB, lrow, rstart = matin->rmap->rstart, rend = matin->rmap->rend;
1718   PetscInt    *cmap, *idx_p;
1719 
1720   PetscFunctionBegin;
1721   PetscCheck(!mat->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Already active");
1722   mat->getrowactive = PETSC_TRUE;
1723 
1724   if (!mat->rowvalues && (idx || v)) {
1725     /*
1726         allocate enough space to hold information from the longest row.
1727     */
1728     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *)mat->A->data, *Ba = (Mat_SeqAIJ *)mat->B->data;
1729     PetscInt    max = 1, tmp;
1730     for (i = 0; i < matin->rmap->n; i++) {
1731       tmp = Aa->i[i + 1] - Aa->i[i] + Ba->i[i + 1] - Ba->i[i];
1732       if (max < tmp) max = tmp;
1733     }
1734     PetscCall(PetscMalloc2(max, &mat->rowvalues, max, &mat->rowindices));
1735   }
1736 
1737   PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Only local rows");
1738   lrow = row - rstart;
1739 
1740   pvA = &vworkA;
1741   pcA = &cworkA;
1742   pvB = &vworkB;
1743   pcB = &cworkB;
1744   if (!v) {
1745     pvA = NULL;
1746     pvB = NULL;
1747   }
1748   if (!idx) {
1749     pcA = NULL;
1750     if (!v) pcB = NULL;
1751   }
1752   PetscCall((*mat->A->ops->getrow)(mat->A, lrow, &nzA, pcA, pvA));
1753   PetscCall((*mat->B->ops->getrow)(mat->B, lrow, &nzB, pcB, pvB));
1754   nztot = nzA + nzB;
1755 
1756   cmap = mat->garray;
1757   if (v || idx) {
1758     if (nztot) {
1759       /* Sort by increasing column numbers, assuming A and B already sorted */
1760       PetscInt imark = -1;
1761       if (v) {
1762         *v = v_p = mat->rowvalues;
1763         for (i = 0; i < nzB; i++) {
1764           if (cmap[cworkB[i]] < cstart) v_p[i] = vworkB[i];
1765           else break;
1766         }
1767         imark = i;
1768         for (i = 0; i < nzA; i++) v_p[imark + i] = vworkA[i];
1769         for (i = imark; i < nzB; i++) v_p[nzA + i] = vworkB[i];
1770       }
1771       if (idx) {
1772         *idx = idx_p = mat->rowindices;
1773         if (imark > -1) {
1774           for (i = 0; i < imark; i++) idx_p[i] = cmap[cworkB[i]];
1775         } else {
1776           for (i = 0; i < nzB; i++) {
1777             if (cmap[cworkB[i]] < cstart) idx_p[i] = cmap[cworkB[i]];
1778             else break;
1779           }
1780           imark = i;
1781         }
1782         for (i = 0; i < nzA; i++) idx_p[imark + i] = cstart + cworkA[i];
1783         for (i = imark; i < nzB; i++) idx_p[nzA + i] = cmap[cworkB[i]];
1784       }
1785     } else {
1786       if (idx) *idx = NULL;
1787       if (v) *v = NULL;
1788     }
1789   }
1790   *nz = nztot;
1791   PetscCall((*mat->A->ops->restorerow)(mat->A, lrow, &nzA, pcA, pvA));
1792   PetscCall((*mat->B->ops->restorerow)(mat->B, lrow, &nzB, pcB, pvB));
1793   PetscFunctionReturn(PETSC_SUCCESS);
1794 }
1795 
1796 PetscErrorCode MatRestoreRow_MPIAIJ(Mat mat, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1797 {
1798   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1799 
1800   PetscFunctionBegin;
1801   PetscCheck(aij->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "MatGetRow() must be called first");
1802   aij->getrowactive = PETSC_FALSE;
1803   PetscFunctionReturn(PETSC_SUCCESS);
1804 }
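/*
   A hedged usage sketch (not part of the library source): the routines above are reached via
   MatGetRow()/MatRestoreRow(), which work only on locally owned rows and return column indices
   already merged and sorted as described above.

     PetscInt           rstart, rend, ncols;
     const PetscInt    *cols;
     const PetscScalar *vals;

     PetscCall(MatGetOwnershipRange(A, &rstart, &rend));
     for (PetscInt row = rstart; row < rend; row++) {
       PetscCall(MatGetRow(A, row, &ncols, &cols, &vals));
       // ... inspect cols[0..ncols) and vals[0..ncols) ...
       PetscCall(MatRestoreRow(A, row, &ncols, &cols, &vals));
     }
*/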
1805 
1806 static PetscErrorCode MatNorm_MPIAIJ(Mat mat, NormType type, PetscReal *norm)
1807 {
1808   Mat_MPIAIJ      *aij  = (Mat_MPIAIJ *)mat->data;
1809   Mat_SeqAIJ      *amat = (Mat_SeqAIJ *)aij->A->data, *bmat = (Mat_SeqAIJ *)aij->B->data;
1810   PetscInt         i, j;
1811   PetscReal        sum = 0.0;
1812   const MatScalar *v, *amata, *bmata;
1813 
1814   PetscFunctionBegin;
1815   if (aij->size == 1) {
1816     PetscCall(MatNorm(aij->A, type, norm));
1817   } else {
1818     PetscCall(MatSeqAIJGetArrayRead(aij->A, &amata));
1819     PetscCall(MatSeqAIJGetArrayRead(aij->B, &bmata));
1820     if (type == NORM_FROBENIUS) {
1821       v = amata;
1822       for (i = 0; i < amat->nz; i++) {
1823         sum += PetscRealPart(PetscConj(*v) * (*v));
1824         v++;
1825       }
1826       v = bmata;
1827       for (i = 0; i < bmat->nz; i++) {
1828         sum += PetscRealPart(PetscConj(*v) * (*v));
1829         v++;
1830       }
1831       PetscCallMPI(MPIU_Allreduce(&sum, norm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1832       *norm = PetscSqrtReal(*norm);
1833       PetscCall(PetscLogFlops(2.0 * amat->nz + 2.0 * bmat->nz));
1834     } else if (type == NORM_1) { /* max column norm */
1835       Vec          col;
1836       PetscScalar *array;
1837       PetscInt    *jj, *garray = aij->garray;
1838 
1839       PetscCall(MatCreateVecs(mat, &col, NULL));
1840       PetscCall(VecSet(col, 0.0));
1841       PetscCall(VecGetArrayWrite(col, &array));
1842       v  = amata;
1843       jj = amat->j;
1844       for (j = 0; j < amat->nz; j++) array[*jj++] += PetscAbsScalar(*v++);
1845       PetscCall(VecRestoreArrayWrite(col, &array));
1846       v  = bmata;
1847       jj = bmat->j;
1848       for (j = 0; j < bmat->nz; j++) PetscCall(VecSetValue(col, garray[*jj++], PetscAbsScalar(*v++), ADD_VALUES));
1849       PetscCall(VecAssemblyBegin(col));
1850       PetscCall(VecAssemblyEnd(col));
1851       PetscCall(VecNorm(col, NORM_INFINITY, norm));
1852       PetscCall(VecDestroy(&col));
1853     } else if (type == NORM_INFINITY) { /* max row norm */
1854       PetscReal ntemp = 0.0;
1855       for (j = 0; j < aij->A->rmap->n; j++) {
1856         v   = PetscSafePointerPlusOffset(amata, amat->i[j]);
1857         sum = 0.0;
1858         for (i = 0; i < amat->i[j + 1] - amat->i[j]; i++) {
1859           sum += PetscAbsScalar(*v);
1860           v++;
1861         }
1862         v = PetscSafePointerPlusOffset(bmata, bmat->i[j]);
1863         for (i = 0; i < bmat->i[j + 1] - bmat->i[j]; i++) {
1864           sum += PetscAbsScalar(*v);
1865           v++;
1866         }
1867         if (sum > ntemp) ntemp = sum;
1868       }
1869       PetscCallMPI(MPIU_Allreduce(&ntemp, norm, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)mat)));
1870       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1871     } else SETERRQ(PetscObjectComm((PetscObject)mat), PETSC_ERR_SUP, "No support for two norm");
1872     PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &amata));
1873     PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &bmata));
1874   }
1875   PetscFunctionReturn(PETSC_SUCCESS);
1876 }
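/*
   A hedged usage sketch (not part of the library source): the three supported norms map to the
   branches above (entrywise Frobenius, maximum column sum, maximum row sum); NORM_2 is rejected.

     PetscReal nrm;

     PetscCall(MatNorm(A, NORM_FROBENIUS, &nrm));
     PetscCall(MatNorm(A, NORM_1, &nrm));
     PetscCall(MatNorm(A, NORM_INFINITY, &nrm));
*/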
1877 
1878 static PetscErrorCode MatTranspose_MPIAIJ(Mat A, MatReuse reuse, Mat *matout)
1879 {
1880   Mat_MPIAIJ      *a    = (Mat_MPIAIJ *)A->data, *b;
1881   Mat_SeqAIJ      *Aloc = (Mat_SeqAIJ *)a->A->data, *Bloc = (Mat_SeqAIJ *)a->B->data, *sub_B_diag;
1882   PetscInt         M = A->rmap->N, N = A->cmap->N, ma, na, mb, nb, row, *cols, *cols_tmp, *B_diag_ilen, i, ncol, A_diag_ncol;
1883   const PetscInt  *ai, *aj, *bi, *bj, *B_diag_i;
1884   Mat              B, A_diag, *B_diag;
1885   const MatScalar *pbv, *bv;
1886 
1887   PetscFunctionBegin;
1888   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *matout));
1889   ma = A->rmap->n;
1890   na = A->cmap->n;
1891   mb = a->B->rmap->n;
1892   nb = a->B->cmap->n;
1893   ai = Aloc->i;
1894   aj = Aloc->j;
1895   bi = Bloc->i;
1896   bj = Bloc->j;
1897   if (reuse == MAT_INITIAL_MATRIX || *matout == A) {
1898     PetscInt            *d_nnz, *g_nnz, *o_nnz;
1899     PetscSFNode         *oloc;
1900     PETSC_UNUSED PetscSF sf;
1901 
1902     PetscCall(PetscMalloc4(na, &d_nnz, na, &o_nnz, nb, &g_nnz, nb, &oloc));
1903     /* compute d_nnz for preallocation */
1904     PetscCall(PetscArrayzero(d_nnz, na));
1905     for (i = 0; i < ai[ma]; i++) d_nnz[aj[i]]++;
1906     /* compute local off-diagonal contributions */
1907     PetscCall(PetscArrayzero(g_nnz, nb));
1908     for (i = 0; i < bi[ma]; i++) g_nnz[bj[i]]++;
1909     /* map those to global */
1910     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1911     PetscCall(PetscSFSetGraphLayout(sf, A->cmap, nb, NULL, PETSC_USE_POINTER, a->garray));
1912     PetscCall(PetscSFSetFromOptions(sf));
1913     PetscCall(PetscArrayzero(o_nnz, na));
1914     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1915     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1916     PetscCall(PetscSFDestroy(&sf));
1917 
1918     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
1919     PetscCall(MatSetSizes(B, A->cmap->n, A->rmap->n, N, M));
1920     PetscCall(MatSetBlockSizes(B, A->cmap->bs, A->rmap->bs));
1921     PetscCall(MatSetType(B, ((PetscObject)A)->type_name));
1922     PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
1923     PetscCall(PetscFree4(d_nnz, o_nnz, g_nnz, oloc));
1924   } else {
1925     B = *matout;
1926     PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
1927   }
1928 
1929   b           = (Mat_MPIAIJ *)B->data;
1930   A_diag      = a->A;
1931   B_diag      = &b->A;
1932   sub_B_diag  = (Mat_SeqAIJ *)(*B_diag)->data;
1933   A_diag_ncol = A_diag->cmap->N;
1934   B_diag_ilen = sub_B_diag->ilen;
1935   B_diag_i    = sub_B_diag->i;
1936 
1937   /* Set ilen for diagonal of B */
1938   for (i = 0; i < A_diag_ncol; i++) B_diag_ilen[i] = B_diag_i[i + 1] - B_diag_i[i];
1939 
1940   /* Transpose the diagonal part of the matrix. In contrast to the off-diagonal part, this can be done
1941   very quickly (i.e., without using MatSetValues()), because all writes are local. */
1942   PetscCall(MatTransposeSetPrecursor(A_diag, *B_diag));
1943   PetscCall(MatTranspose(A_diag, MAT_REUSE_MATRIX, B_diag));
1944 
1945   /* copy over the B part */
1946   PetscCall(PetscMalloc1(bi[mb], &cols));
1947   PetscCall(MatSeqAIJGetArrayRead(a->B, &bv));
1948   pbv = bv;
1949   row = A->rmap->rstart;
1950   for (i = 0; i < bi[mb]; i++) cols[i] = a->garray[bj[i]];
1951   cols_tmp = cols;
1952   for (i = 0; i < mb; i++) {
1953     ncol = bi[i + 1] - bi[i];
1954     PetscCall(MatSetValues(B, ncol, cols_tmp, 1, &row, pbv, INSERT_VALUES));
1955     row++;
1956     if (pbv) pbv += ncol;
1957     if (cols_tmp) cols_tmp += ncol;
1958   }
1959   PetscCall(PetscFree(cols));
1960   PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bv));
1961 
1962   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1963   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1964   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1965     *matout = B;
1966   } else {
1967     PetscCall(MatHeaderMerge(A, &B));
1968   }
1969   PetscFunctionReturn(PETSC_SUCCESS);
1970 }
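/*
   A hedged usage sketch (not part of the library source): an explicit parallel transpose is
   typically created once and then refreshed in place when the values of A change but its
   nonzero pattern does not.

     Mat At;

     PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
     // ... change the numerical values of A, keeping the same nonzero pattern ...
     PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &At));
     PetscCall(MatDestroy(&At));
*/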
1971 
1972 static PetscErrorCode MatDiagonalScale_MPIAIJ(Mat mat, Vec ll, Vec rr)
1973 {
1974   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1975   Mat         a = aij->A, b = aij->B;
1976   PetscInt    s1, s2, s3;
1977 
1978   PetscFunctionBegin;
1979   PetscCall(MatGetLocalSize(mat, &s2, &s3));
1980   if (rr) {
1981     PetscCall(VecGetLocalSize(rr, &s1));
1982     PetscCheck(s1 == s3, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "right vector non-conforming local size");
1983     /* Overlap communication with computation. */
1984     PetscCall(VecScatterBegin(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
1985   }
1986   if (ll) {
1987     PetscCall(VecGetLocalSize(ll, &s1));
1988     PetscCheck(s1 == s2, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "left vector non-conforming local size");
1989     PetscUseTypeMethod(b, diagonalscale, ll, NULL);
1990   }
1991   /* scale the diagonal block */
1992   PetscUseTypeMethod(a, diagonalscale, ll, rr);
1993 
1994   if (rr) {
1995     /* Do a scatter end and then right scale the off-diagonal block */
1996     PetscCall(VecScatterEnd(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
1997     PetscUseTypeMethod(b, diagonalscale, NULL, aij->lvec);
1998   }
1999   PetscFunctionReturn(PETSC_SUCCESS);
2000 }
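/*
   A hedged usage sketch (not part of the library source): MatDiagonalScale() forms
   diag(l) * A * diag(r); either vector may be NULL to scale on one side only. The vectors must
   conform to the row (l) and column (r) layouts of A, e.g. as produced by MatCreateVecs().

     Vec l, r;

     PetscCall(MatCreateVecs(A, &r, &l));
     PetscCall(VecSet(l, 2.0));
     PetscCall(VecSet(r, 0.5));
     PetscCall(MatDiagonalScale(A, l, r));
     PetscCall(VecDestroy(&l));
     PetscCall(VecDestroy(&r));
*/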
2001 
2002 static PetscErrorCode MatSetUnfactored_MPIAIJ(Mat A)
2003 {
2004   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2005 
2006   PetscFunctionBegin;
2007   PetscCall(MatSetUnfactored(a->A));
2008   PetscFunctionReturn(PETSC_SUCCESS);
2009 }
2010 
2011 static PetscErrorCode MatEqual_MPIAIJ(Mat A, Mat B, PetscBool *flag)
2012 {
2013   Mat_MPIAIJ *matB = (Mat_MPIAIJ *)B->data, *matA = (Mat_MPIAIJ *)A->data;
2014   Mat         a, b, c, d;
2015   PetscBool   flg;
2016 
2017   PetscFunctionBegin;
2018   a = matA->A;
2019   b = matA->B;
2020   c = matB->A;
2021   d = matB->B;
2022 
2023   PetscCall(MatEqual(a, c, &flg));
2024   if (flg) PetscCall(MatEqual(b, d, &flg));
2025   PetscCallMPI(MPIU_Allreduce(&flg, flag, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
2026   PetscFunctionReturn(PETSC_SUCCESS);
2027 }
2028 
2029 static PetscErrorCode MatCopy_MPIAIJ(Mat A, Mat B, MatStructure str)
2030 {
2031   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2032   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2033 
2034   PetscFunctionBegin;
2035   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
2036   if ((str != SAME_NONZERO_PATTERN) || (A->ops->copy != B->ops->copy)) {
2037     /* Because of the column compression in the off-process part of the matrix a->B,
2038        the number of columns in a->B and b->B may differ, hence we cannot call
2039        MatCopy() directly on the two parts. If need be, a more efficient copy than
2040        MatCopy_Basic() could be provided by first uncompressing the a->B matrices and
2041        then copying the submatrices */
2042     PetscCall(MatCopy_Basic(A, B, str));
2043   } else {
2044     PetscCall(MatCopy(a->A, b->A, str));
2045     PetscCall(MatCopy(a->B, b->B, str));
2046   }
2047   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2048   PetscFunctionReturn(PETSC_SUCCESS);
2049 }
2050 
2051 /*
2052    Computes the number of nonzeros per row needed for preallocation when X and Y
2053    have different nonzero structure.
2054 */
2055 PetscErrorCode MatAXPYGetPreallocation_MPIX_private(PetscInt m, const PetscInt *xi, const PetscInt *xj, const PetscInt *xltog, const PetscInt *yi, const PetscInt *yj, const PetscInt *yltog, PetscInt *nnz)
2056 {
2057   PetscInt i, j, k, nzx, nzy;
2058 
2059   PetscFunctionBegin;
2060   /* Set the number of nonzeros in the new matrix */
2061   for (i = 0; i < m; i++) {
2062     const PetscInt *xjj = PetscSafePointerPlusOffset(xj, xi[i]), *yjj = PetscSafePointerPlusOffset(yj, yi[i]);
2063     nzx    = xi[i + 1] - xi[i];
2064     nzy    = yi[i + 1] - yi[i];
2065     nnz[i] = 0;
2066     for (j = 0, k = 0; j < nzx; j++) {                                /* Point in X */
2067       for (; k < nzy && yltog[yjj[k]] < xltog[xjj[j]]; k++) nnz[i]++; /* Catch up to X */
2068       if (k < nzy && yltog[yjj[k]] == xltog[xjj[j]]) k++;             /* Skip duplicate */
2069       nnz[i]++;
2070     }
2071     for (; k < nzy; k++) nnz[i]++;
2072   }
2073   PetscFunctionReturn(PETSC_SUCCESS);
2074 }
2075 
2076 /* This is the same as MatAXPYGetPreallocation_SeqAIJ, except that the local-to-global map is provided */
2077 static PetscErrorCode MatAXPYGetPreallocation_MPIAIJ(Mat Y, const PetscInt *yltog, Mat X, const PetscInt *xltog, PetscInt *nnz)
2078 {
2079   PetscInt    m = Y->rmap->N;
2080   Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data;
2081   Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
2082 
2083   PetscFunctionBegin;
2084   PetscCall(MatAXPYGetPreallocation_MPIX_private(m, x->i, x->j, xltog, y->i, y->j, yltog, nnz));
2085   PetscFunctionReturn(PETSC_SUCCESS);
2086 }
2087 
2088 static PetscErrorCode MatAXPY_MPIAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2089 {
2090   Mat_MPIAIJ *xx = (Mat_MPIAIJ *)X->data, *yy = (Mat_MPIAIJ *)Y->data;
2091 
2092   PetscFunctionBegin;
2093   if (str == SAME_NONZERO_PATTERN) {
2094     PetscCall(MatAXPY(yy->A, a, xx->A, str));
2095     PetscCall(MatAXPY(yy->B, a, xx->B, str));
2096   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2097     PetscCall(MatAXPY_Basic(Y, a, X, str));
2098   } else {
2099     Mat       B;
2100     PetscInt *nnz_d, *nnz_o;
2101 
2102     PetscCall(PetscMalloc1(yy->A->rmap->N, &nnz_d));
2103     PetscCall(PetscMalloc1(yy->B->rmap->N, &nnz_o));
2104     PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
2105     PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
2106     PetscCall(MatSetLayouts(B, Y->rmap, Y->cmap));
2107     PetscCall(MatSetType(B, ((PetscObject)Y)->type_name));
2108     PetscCall(MatAXPYGetPreallocation_SeqAIJ(yy->A, xx->A, nnz_d));
2109     PetscCall(MatAXPYGetPreallocation_MPIAIJ(yy->B, yy->garray, xx->B, xx->garray, nnz_o));
2110     PetscCall(MatMPIAIJSetPreallocation(B, 0, nnz_d, 0, nnz_o));
2111     PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
2112     PetscCall(MatHeaderMerge(Y, &B));
2113     PetscCall(PetscFree(nnz_d));
2114     PetscCall(PetscFree(nnz_o));
2115   }
2116   PetscFunctionReturn(PETSC_SUCCESS);
2117 }
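/*
   A hedged usage sketch (not part of the library source): the MatStructure argument of
   MatAXPY() selects among the branches above; DIFFERENT_NONZERO_PATTERN triggers the
   preallocation path that merges the two patterns before computing Y = Y + a*X.

     PetscCall(MatAXPY(Y, 2.0, X, DIFFERENT_NONZERO_PATTERN));
*/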
2118 
2119 PETSC_INTERN PetscErrorCode MatConjugate_SeqAIJ(Mat);
2120 
2121 static PetscErrorCode MatConjugate_MPIAIJ(Mat mat)
2122 {
2123   PetscFunctionBegin;
2124   if (PetscDefined(USE_COMPLEX)) {
2125     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2126 
2127     PetscCall(MatConjugate_SeqAIJ(aij->A));
2128     PetscCall(MatConjugate_SeqAIJ(aij->B));
2129   }
2130   PetscFunctionReturn(PETSC_SUCCESS);
2131 }
2132 
2133 static PetscErrorCode MatRealPart_MPIAIJ(Mat A)
2134 {
2135   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2136 
2137   PetscFunctionBegin;
2138   PetscCall(MatRealPart(a->A));
2139   PetscCall(MatRealPart(a->B));
2140   PetscFunctionReturn(PETSC_SUCCESS);
2141 }
2142 
2143 static PetscErrorCode MatImaginaryPart_MPIAIJ(Mat A)
2144 {
2145   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2146 
2147   PetscFunctionBegin;
2148   PetscCall(MatImaginaryPart(a->A));
2149   PetscCall(MatImaginaryPart(a->B));
2150   PetscFunctionReturn(PETSC_SUCCESS);
2151 }
2152 
2153 static PetscErrorCode MatGetRowMaxAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2154 {
2155   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
2156   PetscInt           i, *idxb = NULL, m = A->rmap->n;
2157   PetscScalar       *vv;
2158   Vec                vB, vA;
2159   const PetscScalar *va, *vb;
2160 
2161   PetscFunctionBegin;
2162   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2163   PetscCall(MatGetRowMaxAbs(a->A, vA, idx));
2164 
2165   PetscCall(VecGetArrayRead(vA, &va));
2166   if (idx) {
2167     for (i = 0; i < m; i++) {
2168       if (PetscAbsScalar(va[i])) idx[i] += A->cmap->rstart;
2169     }
2170   }
2171 
2172   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2173   PetscCall(PetscMalloc1(m, &idxb));
2174   PetscCall(MatGetRowMaxAbs(a->B, vB, idxb));
2175 
2176   PetscCall(VecGetArrayWrite(v, &vv));
2177   PetscCall(VecGetArrayRead(vB, &vb));
2178   for (i = 0; i < m; i++) {
2179     if (PetscAbsScalar(va[i]) < PetscAbsScalar(vb[i])) {
2180       vv[i] = vb[i];
2181       if (idx) idx[i] = a->garray[idxb[i]];
2182     } else {
2183       vv[i] = va[i];
2184       if (idx && PetscAbsScalar(va[i]) == PetscAbsScalar(vb[i]) && idxb[i] != -1 && idx[i] > a->garray[idxb[i]]) idx[i] = a->garray[idxb[i]];
2185     }
2186   }
2187   PetscCall(VecRestoreArrayWrite(v, &vv));
2188   PetscCall(VecRestoreArrayRead(vA, &va));
2189   PetscCall(VecRestoreArrayRead(vB, &vb));
2190   PetscCall(PetscFree(idxb));
2191   PetscCall(VecDestroy(&vA));
2192   PetscCall(VecDestroy(&vB));
2193   PetscFunctionReturn(PETSC_SUCCESS);
2194 }
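/*
   A hedged usage sketch (not part of the library source): the row-wise reduction routines in
   this file (MatGetRowMaxAbs(), MatGetRowSumAbs(), MatGetRowMinAbs(), MatGetRowMin(),
   MatGetRowMax()) all follow the same calling pattern; the optional idx[] array, of length
   equal to the number of local rows, receives the global column of the selected entry per row.

     Vec       rowmax;
     PetscInt *loc, m;

     PetscCall(MatGetLocalSize(A, &m, NULL));
     PetscCall(PetscMalloc1(m, &loc));
     PetscCall(MatCreateVecs(A, NULL, &rowmax));
     PetscCall(MatGetRowMaxAbs(A, rowmax, loc));
     PetscCall(PetscFree(loc));
     PetscCall(VecDestroy(&rowmax));
*/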
2195 
2196 static PetscErrorCode MatGetRowSumAbs_MPIAIJ(Mat A, Vec v)
2197 {
2198   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2199   Vec         vB, vA;
2200 
2201   PetscFunctionBegin;
2202   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2203   PetscCall(MatGetRowSumAbs(a->A, vA));
2204   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2205   PetscCall(MatGetRowSumAbs(a->B, vB));
2206   PetscCall(VecAXPY(vA, 1.0, vB));
2207   PetscCall(VecDestroy(&vB));
2208   PetscCall(VecCopy(vA, v));
2209   PetscCall(VecDestroy(&vA));
2210   PetscFunctionReturn(PETSC_SUCCESS);
2211 }
2212 
2213 static PetscErrorCode MatGetRowMinAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2214 {
2215   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2216   PetscInt           m = A->rmap->n, n = A->cmap->n;
2217   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2218   PetscInt          *cmap = mat->garray;
2219   PetscInt          *diagIdx, *offdiagIdx;
2220   Vec                diagV, offdiagV;
2221   PetscScalar       *a, *diagA, *offdiagA;
2222   const PetscScalar *ba, *bav;
2223   PetscInt           r, j, col, ncols, *bi, *bj;
2224   Mat                B = mat->B;
2225   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2226 
2227   PetscFunctionBegin;
2228   /* When a process holds entire A and other processes have no entry */
2229   if (A->cmap->N == n) {
2230     PetscCall(VecGetArrayWrite(v, &diagA));
2231     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2232     PetscCall(MatGetRowMinAbs(mat->A, diagV, idx));
2233     PetscCall(VecDestroy(&diagV));
2234     PetscCall(VecRestoreArrayWrite(v, &diagA));
2235     PetscFunctionReturn(PETSC_SUCCESS);
2236   } else if (n == 0) {
2237     if (m) {
2238       PetscCall(VecGetArrayWrite(v, &a));
2239       for (r = 0; r < m; r++) {
2240         a[r] = 0.0;
2241         if (idx) idx[r] = -1;
2242       }
2243       PetscCall(VecRestoreArrayWrite(v, &a));
2244     }
2245     PetscFunctionReturn(PETSC_SUCCESS);
2246   }
2247 
2248   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2249   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2250   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2251   PetscCall(MatGetRowMinAbs(mat->A, diagV, diagIdx));
2252 
2253   /* Get offdiagIdx[] for implicit 0.0 */
2254   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2255   ba = bav;
2256   bi = b->i;
2257   bj = b->j;
2258   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2259   for (r = 0; r < m; r++) {
2260     ncols = bi[r + 1] - bi[r];
2261     if (ncols == A->cmap->N - n) { /* Brow is dense */
2262       offdiagA[r]   = *ba;
2263       offdiagIdx[r] = cmap[0];
2264     } else { /* Brow is sparse, so it contains an implicit 0.0 and the minimum absolute value is 0.0 */
2265       offdiagA[r] = 0.0;
2266 
2267       /* Find first hole in the cmap */
2268       for (j = 0; j < ncols; j++) {
2269         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2270         if (col > j && j < cstart) {
2271           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2272           break;
2273         } else if (col > j + n && j >= cstart) {
2274           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2275           break;
2276         }
2277       }
2278       if (j == ncols && ncols < A->cmap->N - n) {
2279         /* a hole is outside compressed Bcols */
2280         if (ncols == 0) {
2281           if (cstart) {
2282             offdiagIdx[r] = 0;
2283           } else offdiagIdx[r] = cend;
2284         } else { /* ncols > 0 */
2285           offdiagIdx[r] = cmap[ncols - 1] + 1;
2286           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2287         }
2288       }
2289     }
2290 
2291     for (j = 0; j < ncols; j++) {
2292       if (PetscAbsScalar(offdiagA[r]) > PetscAbsScalar(*ba)) {
2293         offdiagA[r]   = *ba;
2294         offdiagIdx[r] = cmap[*bj];
2295       }
2296       ba++;
2297       bj++;
2298     }
2299   }
2300 
2301   PetscCall(VecGetArrayWrite(v, &a));
2302   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2303   for (r = 0; r < m; ++r) {
2304     if (PetscAbsScalar(diagA[r]) < PetscAbsScalar(offdiagA[r])) {
2305       a[r] = diagA[r];
2306       if (idx) idx[r] = cstart + diagIdx[r];
2307     } else if (PetscAbsScalar(diagA[r]) == PetscAbsScalar(offdiagA[r])) {
2308       a[r] = diagA[r];
2309       if (idx) {
2310         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2311           idx[r] = cstart + diagIdx[r];
2312         } else idx[r] = offdiagIdx[r];
2313       }
2314     } else {
2315       a[r] = offdiagA[r];
2316       if (idx) idx[r] = offdiagIdx[r];
2317     }
2318   }
2319   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2320   PetscCall(VecRestoreArrayWrite(v, &a));
2321   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2322   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2323   PetscCall(VecDestroy(&diagV));
2324   PetscCall(VecDestroy(&offdiagV));
2325   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2326   PetscFunctionReturn(PETSC_SUCCESS);
2327 }
2328 
2329 static PetscErrorCode MatGetRowMin_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2330 {
2331   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2332   PetscInt           m = A->rmap->n, n = A->cmap->n;
2333   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2334   PetscInt          *cmap = mat->garray;
2335   PetscInt          *diagIdx, *offdiagIdx;
2336   Vec                diagV, offdiagV;
2337   PetscScalar       *a, *diagA, *offdiagA;
2338   const PetscScalar *ba, *bav;
2339   PetscInt           r, j, col, ncols, *bi, *bj;
2340   Mat                B = mat->B;
2341   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2342 
2343   PetscFunctionBegin;
2344   /* When a process holds entire A and other processes have no entry */
2345   if (A->cmap->N == n) {
2346     PetscCall(VecGetArrayWrite(v, &diagA));
2347     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2348     PetscCall(MatGetRowMin(mat->A, diagV, idx));
2349     PetscCall(VecDestroy(&diagV));
2350     PetscCall(VecRestoreArrayWrite(v, &diagA));
2351     PetscFunctionReturn(PETSC_SUCCESS);
2352   } else if (n == 0) {
2353     if (m) {
2354       PetscCall(VecGetArrayWrite(v, &a));
2355       for (r = 0; r < m; r++) {
2356         a[r] = PETSC_MAX_REAL;
2357         if (idx) idx[r] = -1;
2358       }
2359       PetscCall(VecRestoreArrayWrite(v, &a));
2360     }
2361     PetscFunctionReturn(PETSC_SUCCESS);
2362   }
2363 
2364   PetscCall(PetscCalloc2(m, &diagIdx, m, &offdiagIdx));
2365   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2366   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2367   PetscCall(MatGetRowMin(mat->A, diagV, diagIdx));
2368 
2369   /* Get offdiagIdx[] for implicit 0.0 */
2370   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2371   ba = bav;
2372   bi = b->i;
2373   bj = b->j;
2374   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2375   for (r = 0; r < m; r++) {
2376     ncols = bi[r + 1] - bi[r];
2377     if (ncols == A->cmap->N - n) { /* Brow is dense */
2378       offdiagA[r]   = *ba;
2379       offdiagIdx[r] = cmap[0];
2380     } else { /* Brow is sparse, so it contains an implicit 0.0 and the minimum is at most 0.0 */
2381       offdiagA[r] = 0.0;
2382 
2383       /* Find first hole in the cmap */
2384       for (j = 0; j < ncols; j++) {
2385         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2386         if (col > j && j < cstart) {
2387           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2388           break;
2389         } else if (col > j + n && j >= cstart) {
2390           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2391           break;
2392         }
2393       }
2394       if (j == ncols && ncols < A->cmap->N - n) {
2395         /* a hole is outside compressed Bcols */
2396         if (ncols == 0) {
2397           if (cstart) {
2398             offdiagIdx[r] = 0;
2399           } else offdiagIdx[r] = cend;
2400         } else { /* ncols > 0 */
2401           offdiagIdx[r] = cmap[ncols - 1] + 1;
2402           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2403         }
2404       }
2405     }
2406 
2407     for (j = 0; j < ncols; j++) {
2408       if (PetscRealPart(offdiagA[r]) > PetscRealPart(*ba)) {
2409         offdiagA[r]   = *ba;
2410         offdiagIdx[r] = cmap[*bj];
2411       }
2412       ba++;
2413       bj++;
2414     }
2415   }
2416 
2417   PetscCall(VecGetArrayWrite(v, &a));
2418   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2419   for (r = 0; r < m; ++r) {
2420     if (PetscRealPart(diagA[r]) < PetscRealPart(offdiagA[r])) {
2421       a[r] = diagA[r];
2422       if (idx) idx[r] = cstart + diagIdx[r];
2423     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2424       a[r] = diagA[r];
2425       if (idx) {
2426         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2427           idx[r] = cstart + diagIdx[r];
2428         } else idx[r] = offdiagIdx[r];
2429       }
2430     } else {
2431       a[r] = offdiagA[r];
2432       if (idx) idx[r] = offdiagIdx[r];
2433     }
2434   }
2435   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2436   PetscCall(VecRestoreArrayWrite(v, &a));
2437   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2438   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2439   PetscCall(VecDestroy(&diagV));
2440   PetscCall(VecDestroy(&offdiagV));
2441   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2442   PetscFunctionReturn(PETSC_SUCCESS);
2443 }
2444 
2445 static PetscErrorCode MatGetRowMax_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2446 {
2447   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2448   PetscInt           m = A->rmap->n, n = A->cmap->n;
2449   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2450   PetscInt          *cmap = mat->garray;
2451   PetscInt          *diagIdx, *offdiagIdx;
2452   Vec                diagV, offdiagV;
2453   PetscScalar       *a, *diagA, *offdiagA;
2454   const PetscScalar *ba, *bav;
2455   PetscInt           r, j, col, ncols, *bi, *bj;
2456   Mat                B = mat->B;
2457   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2458 
2459   PetscFunctionBegin;
2460   /* When a process holds entire A and other processes have no entry */
2461   if (A->cmap->N == n) {
2462     PetscCall(VecGetArrayWrite(v, &diagA));
2463     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2464     PetscCall(MatGetRowMax(mat->A, diagV, idx));
2465     PetscCall(VecDestroy(&diagV));
2466     PetscCall(VecRestoreArrayWrite(v, &diagA));
2467     PetscFunctionReturn(PETSC_SUCCESS);
2468   } else if (n == 0) {
2469     if (m) {
2470       PetscCall(VecGetArrayWrite(v, &a));
2471       for (r = 0; r < m; r++) {
2472         a[r] = PETSC_MIN_REAL;
2473         if (idx) idx[r] = -1;
2474       }
2475       PetscCall(VecRestoreArrayWrite(v, &a));
2476     }
2477     PetscFunctionReturn(PETSC_SUCCESS);
2478   }
2479 
2480   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2481   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2482   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2483   PetscCall(MatGetRowMax(mat->A, diagV, diagIdx));
2484 
2485   /* Get offdiagIdx[] for implicit 0.0 */
2486   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2487   ba = bav;
2488   bi = b->i;
2489   bj = b->j;
2490   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2491   for (r = 0; r < m; r++) {
2492     ncols = bi[r + 1] - bi[r];
2493     if (ncols == A->cmap->N - n) { /* Brow is dense */
2494       offdiagA[r]   = *ba;
2495       offdiagIdx[r] = cmap[0];
2496     } else { /* Brow is sparse, so it contains an implicit 0.0 and the maximum is at least 0.0 */
2497       offdiagA[r] = 0.0;
2498 
2499       /* Find first hole in the cmap */
2500       for (j = 0; j < ncols; j++) {
2501         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2502         if (col > j && j < cstart) {
2503           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2504           break;
2505         } else if (col > j + n && j >= cstart) {
2506           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2507           break;
2508         }
2509       }
2510       if (j == ncols && ncols < A->cmap->N - n) {
2511         /* a hole is outside compressed Bcols */
2512         if (ncols == 0) {
2513           if (cstart) {
2514             offdiagIdx[r] = 0;
2515           } else offdiagIdx[r] = cend;
2516         } else { /* ncols > 0 */
2517           offdiagIdx[r] = cmap[ncols - 1] + 1;
2518           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2519         }
2520       }
2521     }
2522 
2523     for (j = 0; j < ncols; j++) {
2524       if (PetscRealPart(offdiagA[r]) < PetscRealPart(*ba)) {
2525         offdiagA[r]   = *ba;
2526         offdiagIdx[r] = cmap[*bj];
2527       }
2528       ba++;
2529       bj++;
2530     }
2531   }
2532 
2533   PetscCall(VecGetArrayWrite(v, &a));
2534   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2535   for (r = 0; r < m; ++r) {
2536     if (PetscRealPart(diagA[r]) > PetscRealPart(offdiagA[r])) {
2537       a[r] = diagA[r];
2538       if (idx) idx[r] = cstart + diagIdx[r];
2539     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2540       a[r] = diagA[r];
2541       if (idx) {
2542         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2543           idx[r] = cstart + diagIdx[r];
2544         } else idx[r] = offdiagIdx[r];
2545       }
2546     } else {
2547       a[r] = offdiagA[r];
2548       if (idx) idx[r] = offdiagIdx[r];
2549     }
2550   }
2551   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2552   PetscCall(VecRestoreArrayWrite(v, &a));
2553   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2554   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2555   PetscCall(VecDestroy(&diagV));
2556   PetscCall(VecDestroy(&offdiagV));
2557   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2558   PetscFunctionReturn(PETSC_SUCCESS);
2559 }
2560 
2561 PetscErrorCode MatGetSeqNonzeroStructure_MPIAIJ(Mat mat, Mat *newmat)
2562 {
2563   Mat *dummy;
2564 
2565   PetscFunctionBegin;
2566   PetscCall(MatCreateSubMatrix_MPIAIJ_All(mat, MAT_DO_NOT_GET_VALUES, MAT_INITIAL_MATRIX, &dummy));
2567   *newmat = *dummy;
2568   PetscCall(PetscFree(dummy));
2569   PetscFunctionReturn(PETSC_SUCCESS);
2570 }
2571 
2572 static PetscErrorCode MatInvertBlockDiagonal_MPIAIJ(Mat A, const PetscScalar **values)
2573 {
2574   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2575 
2576   PetscFunctionBegin;
2577   PetscCall(MatInvertBlockDiagonal(a->A, values));
2578   A->factorerrortype = a->A->factorerrortype;
2579   PetscFunctionReturn(PETSC_SUCCESS);
2580 }
2581 
2582 static PetscErrorCode MatSetRandom_MPIAIJ(Mat x, PetscRandom rctx)
2583 {
2584   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)x->data;
2585 
2586   PetscFunctionBegin;
2587   PetscCheck(x->assembled || x->preallocated, PetscObjectComm((PetscObject)x), PETSC_ERR_ARG_WRONGSTATE, "MatSetRandom on an unassembled and unpreallocated MATMPIAIJ is not allowed");
2588   PetscCall(MatSetRandom(aij->A, rctx));
2589   if (x->assembled) {
2590     PetscCall(MatSetRandom(aij->B, rctx));
2591   } else {
2592     PetscCall(MatSetRandomSkipColumnRange_SeqAIJ_Private(aij->B, x->cmap->rstart, x->cmap->rend, rctx));
2593   }
2594   PetscCall(MatAssemblyBegin(x, MAT_FINAL_ASSEMBLY));
2595   PetscCall(MatAssemblyEnd(x, MAT_FINAL_ASSEMBLY));
2596   PetscFunctionReturn(PETSC_SUCCESS);
2597 }
2598 
2599 static PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ(Mat A, PetscBool sc)
2600 {
2601   PetscFunctionBegin;
2602   if (sc) A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ_Scalable;
2603   else A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ;
2604   PetscFunctionReturn(PETSC_SUCCESS);
2605 }
2606 
2607 /*@
2608   MatMPIAIJGetNumberNonzeros - gets the number of nonzeros in the matrix on this MPI rank
2609 
2610   Not Collective
2611 
2612   Input Parameter:
2613 . A - the matrix
2614 
2615   Output Parameter:
2616 . nz - the number of nonzeros
2617 
2618   Level: advanced
2619 
2620 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2621 @*/
2622 PetscErrorCode MatMPIAIJGetNumberNonzeros(Mat A, PetscCount *nz)
2623 {
2624   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)A->data;
2625   Mat_SeqAIJ *aaij = (Mat_SeqAIJ *)maij->A->data, *baij = (Mat_SeqAIJ *)maij->B->data;
2626   PetscBool   isaij;
2627 
2628   PetscFunctionBegin;
2629   PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIAIJ, &isaij));
2630   PetscCheck(isaij, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
2631   *nz = aaij->i[A->rmap->n] + baij->i[A->rmap->n];
2632   PetscFunctionReturn(PETSC_SUCCESS);
2633 }
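
/*
  A minimal usage sketch (illustrative, not part of the manual page above); A is assumed to be an
  already assembled MATMPIAIJ matrix:

    PetscCount nz;

    PetscCall(MatMPIAIJGetNumberNonzeros(A, &nz));
    PetscCall(PetscPrintf(PETSC_COMM_SELF, "rank-local nonzeros: %" PetscInt64_FMT "\n", (PetscInt64)nz));
*/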
2634 
2635 /*@
2636   MatMPIAIJSetUseScalableIncreaseOverlap - Sets whether the matrix uses a scalable algorithm to compute the overlap in `MatIncreaseOverlap()`
2637 
2638   Collective
2639 
2640   Input Parameters:
2641 + A  - the matrix
2642 - sc - `PETSC_TRUE` to use the scalable algorithm (the default is to not use it)
2643 
2644   Level: advanced
2645 
2646 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2647 @*/
2648 PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap(Mat A, PetscBool sc)
2649 {
2650   PetscFunctionBegin;
2651   PetscTryMethod(A, "MatMPIAIJSetUseScalableIncreaseOverlap_C", (Mat, PetscBool), (A, sc));
2652   PetscFunctionReturn(PETSC_SUCCESS);
2653 }
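
/*
  A minimal usage sketch (illustrative); A is assumed to be a MATMPIAIJ matrix, and nis, is, and ov
  are the index-set count, index sets, and overlap already set up by the caller:

    PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, PETSC_TRUE));
    PetscCall(MatIncreaseOverlap(A, nis, is, ov));
*/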
2654 
2655 PetscErrorCode MatSetFromOptions_MPIAIJ(Mat A, PetscOptionItems PetscOptionsObject)
2656 {
2657   PetscBool sc = PETSC_FALSE, flg;
2658 
2659   PetscFunctionBegin;
2660   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJ options");
2661   if (A->ops->increaseoverlap == MatIncreaseOverlap_MPIAIJ_Scalable) sc = PETSC_TRUE;
2662   PetscCall(PetscOptionsBool("-mat_increase_overlap_scalable", "Use a scalable algorithm to compute the overlap", "MatIncreaseOverlap", sc, &sc, &flg));
2663   if (flg) PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, sc));
2664   PetscOptionsHeadEnd();
2665   PetscFunctionReturn(PETSC_SUCCESS);
2666 }
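
/*
  The same switch can also be made at runtime through the option registered above, e.g.
  (illustrative command line; "./myapp" is a placeholder and MatSetFromOptions() must be called
  on the matrix):

    ./myapp -mat_increase_overlap_scalable
*/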
2667 
2668 static PetscErrorCode MatShift_MPIAIJ(Mat Y, PetscScalar a)
2669 {
2670   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)Y->data;
2671   Mat_SeqAIJ *aij  = (Mat_SeqAIJ *)maij->A->data;
2672 
2673   PetscFunctionBegin;
2674   if (!Y->preallocated) {
2675     PetscCall(MatMPIAIJSetPreallocation(Y, 1, NULL, 0, NULL));
2676   } else if (!aij->nz) { /* It does not matter if diagonals of Y only partially lie in maij->A. We just need an estimated preallocation. */
2677     PetscInt nonew = aij->nonew;
2678     PetscCall(MatSeqAIJSetPreallocation(maij->A, 1, NULL));
2679     aij->nonew = nonew;
2680   }
2681   PetscCall(MatShift_Basic(Y, a));
2682   PetscFunctionReturn(PETSC_SUCCESS);
2683 }
2684 
2685 static PetscErrorCode MatInvertVariableBlockDiagonal_MPIAIJ(Mat A, PetscInt nblocks, const PetscInt *bsizes, PetscScalar *diag)
2686 {
2687   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2688 
2689   PetscFunctionBegin;
2690   PetscCall(MatInvertVariableBlockDiagonal(a->A, nblocks, bsizes, diag));
2691   PetscFunctionReturn(PETSC_SUCCESS);
2692 }
2693 
2694 static PetscErrorCode MatEliminateZeros_MPIAIJ(Mat A, PetscBool keep)
2695 {
2696   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2697 
2698   PetscFunctionBegin;
2699   PetscCall(MatEliminateZeros_SeqAIJ(a->A, keep));        // possibly keep zero diagonal coefficients
2700   PetscCall(MatEliminateZeros_SeqAIJ(a->B, PETSC_FALSE)); // never keep zero diagonal coefficients
2701   PetscFunctionReturn(PETSC_SUCCESS);
2702 }
2703 
2704 static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
2705                                        MatGetRow_MPIAIJ,
2706                                        MatRestoreRow_MPIAIJ,
2707                                        MatMult_MPIAIJ,
2708                                        /* 4*/ MatMultAdd_MPIAIJ,
2709                                        MatMultTranspose_MPIAIJ,
2710                                        MatMultTransposeAdd_MPIAIJ,
2711                                        NULL,
2712                                        NULL,
2713                                        NULL,
2714                                        /*10*/ NULL,
2715                                        NULL,
2716                                        NULL,
2717                                        MatSOR_MPIAIJ,
2718                                        MatTranspose_MPIAIJ,
2719                                        /*15*/ MatGetInfo_MPIAIJ,
2720                                        MatEqual_MPIAIJ,
2721                                        MatGetDiagonal_MPIAIJ,
2722                                        MatDiagonalScale_MPIAIJ,
2723                                        MatNorm_MPIAIJ,
2724                                        /*20*/ MatAssemblyBegin_MPIAIJ,
2725                                        MatAssemblyEnd_MPIAIJ,
2726                                        MatSetOption_MPIAIJ,
2727                                        MatZeroEntries_MPIAIJ,
2728                                        /*24*/ MatZeroRows_MPIAIJ,
2729                                        NULL,
2730                                        NULL,
2731                                        NULL,
2732                                        NULL,
2733                                        /*29*/ MatSetUp_MPI_Hash,
2734                                        NULL,
2735                                        NULL,
2736                                        MatGetDiagonalBlock_MPIAIJ,
2737                                        NULL,
2738                                        /*34*/ MatDuplicate_MPIAIJ,
2739                                        NULL,
2740                                        NULL,
2741                                        NULL,
2742                                        NULL,
2743                                        /*39*/ MatAXPY_MPIAIJ,
2744                                        MatCreateSubMatrices_MPIAIJ,
2745                                        MatIncreaseOverlap_MPIAIJ,
2746                                        MatGetValues_MPIAIJ,
2747                                        MatCopy_MPIAIJ,
2748                                        /*44*/ MatGetRowMax_MPIAIJ,
2749                                        MatScale_MPIAIJ,
2750                                        MatShift_MPIAIJ,
2751                                        MatDiagonalSet_MPIAIJ,
2752                                        MatZeroRowsColumns_MPIAIJ,
2753                                        /*49*/ MatSetRandom_MPIAIJ,
2754                                        MatGetRowIJ_MPIAIJ,
2755                                        MatRestoreRowIJ_MPIAIJ,
2756                                        NULL,
2757                                        NULL,
2758                                        /*54*/ MatFDColoringCreate_MPIXAIJ,
2759                                        NULL,
2760                                        MatSetUnfactored_MPIAIJ,
2761                                        MatPermute_MPIAIJ,
2762                                        NULL,
2763                                        /*59*/ MatCreateSubMatrix_MPIAIJ,
2764                                        MatDestroy_MPIAIJ,
2765                                        MatView_MPIAIJ,
2766                                        NULL,
2767                                        NULL,
2768                                        /*64*/ MatMatMatMultNumeric_MPIAIJ_MPIAIJ_MPIAIJ,
2769                                        NULL,
2770                                        NULL,
2771                                        NULL,
2772                                        MatGetRowMaxAbs_MPIAIJ,
2773                                        /*69*/ MatGetRowMinAbs_MPIAIJ,
2774                                        NULL,
2775                                        NULL,
2776                                        MatFDColoringApply_AIJ,
2777                                        MatSetFromOptions_MPIAIJ,
2778                                        MatFindZeroDiagonals_MPIAIJ,
2779                                        /*75*/ NULL,
2780                                        NULL,
2781                                        NULL,
2782                                        MatLoad_MPIAIJ,
2783                                        NULL,
2784                                        /*80*/ NULL,
2785                                        NULL,
2786                                        NULL,
2787                                        /*83*/ NULL,
2788                                        NULL,
2789                                        MatMatMultNumeric_MPIAIJ_MPIAIJ,
2790                                        MatPtAPNumeric_MPIAIJ_MPIAIJ,
2791                                        NULL,
2792                                        NULL,
2793                                        /*89*/ MatBindToCPU_MPIAIJ,
2794                                        MatProductSetFromOptions_MPIAIJ,
2795                                        NULL,
2796                                        NULL,
2797                                        MatConjugate_MPIAIJ,
2798                                        /*94*/ NULL,
2799                                        MatSetValuesRow_MPIAIJ,
2800                                        MatRealPart_MPIAIJ,
2801                                        MatImaginaryPart_MPIAIJ,
2802                                        NULL,
2803                                        /*99*/ NULL,
2804                                        NULL,
2805                                        NULL,
2806                                        MatGetRowMin_MPIAIJ,
2807                                        NULL,
2808                                        /*104*/ MatGetSeqNonzeroStructure_MPIAIJ,
2809                                        NULL,
2810                                        MatGetGhosts_MPIAIJ,
2811                                        NULL,
2812                                        NULL,
2813                                        /*109*/ MatMultDiagonalBlock_MPIAIJ,
2814                                        NULL,
2815                                        NULL,
2816                                        NULL,
2817                                        MatGetMultiProcBlock_MPIAIJ,
2818                                        /*114*/ MatFindNonzeroRows_MPIAIJ,
2819                                        MatGetColumnReductions_MPIAIJ,
2820                                        MatInvertBlockDiagonal_MPIAIJ,
2821                                        MatInvertVariableBlockDiagonal_MPIAIJ,
2822                                        MatCreateSubMatricesMPI_MPIAIJ,
2823                                        /*119*/ NULL,
2824                                        NULL,
2825                                        MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ,
2826                                        NULL,
2827                                        NULL,
2828                                        /*124*/ NULL,
2829                                        NULL,
2830                                        MatSetBlockSizes_MPIAIJ,
2831                                        NULL,
2832                                        MatFDColoringSetUp_MPIXAIJ,
2833                                        /*129*/ MatFindOffBlockDiagonalEntries_MPIAIJ,
2834                                        MatCreateMPIMatConcatenateSeqMat_MPIAIJ,
2835                                        NULL,
2836                                        NULL,
2837                                        NULL,
2838                                        /*134*/ MatCreateGraph_Simple_AIJ,
2839                                        NULL,
2840                                        MatEliminateZeros_MPIAIJ,
2841                                        MatGetRowSumAbs_MPIAIJ,
2842                                        NULL,
2843                                        /*139*/ NULL,
2844                                        NULL,
2845                                        MatCopyHashToXAIJ_MPI_Hash,
2846                                        MatGetCurrentMemType_MPIAIJ,
2847                                        NULL};
2848 
2849 static PetscErrorCode MatStoreValues_MPIAIJ(Mat mat)
2850 {
2851   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2852 
2853   PetscFunctionBegin;
2854   PetscCall(MatStoreValues(aij->A));
2855   PetscCall(MatStoreValues(aij->B));
2856   PetscFunctionReturn(PETSC_SUCCESS);
2857 }
2858 
2859 static PetscErrorCode MatRetrieveValues_MPIAIJ(Mat mat)
2860 {
2861   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2862 
2863   PetscFunctionBegin;
2864   PetscCall(MatRetrieveValues(aij->A));
2865   PetscCall(MatRetrieveValues(aij->B));
2866   PetscFunctionReturn(PETSC_SUCCESS);
2867 }
2868 
2869 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJ(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
2870 {
2871   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2872   PetscMPIInt size;
2873 
2874   PetscFunctionBegin;
2875   if (B->hash_active) {
2876     B->ops[0]      = b->cops;
2877     B->hash_active = PETSC_FALSE;
2878   }
2879   PetscCall(PetscLayoutSetUp(B->rmap));
2880   PetscCall(PetscLayoutSetUp(B->cmap));
2881 
2882 #if defined(PETSC_USE_CTABLE)
2883   PetscCall(PetscHMapIDestroy(&b->colmap));
2884 #else
2885   PetscCall(PetscFree(b->colmap));
2886 #endif
2887   PetscCall(PetscFree(b->garray));
2888   PetscCall(VecDestroy(&b->lvec));
2889   PetscCall(VecScatterDestroy(&b->Mvctx));
2890 
2891   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2892 
2893   MatSeqXAIJGetOptions_Private(b->B);
2894   PetscCall(MatDestroy(&b->B));
2895   PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
2896   PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
2897   PetscCall(MatSetBlockSizesFromMats(b->B, B, B));
2898   PetscCall(MatSetType(b->B, MATSEQAIJ));
2899   MatSeqXAIJRestoreOptions_Private(b->B);
2900 
2901   MatSeqXAIJGetOptions_Private(b->A);
2902   PetscCall(MatDestroy(&b->A));
2903   PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
2904   PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
2905   PetscCall(MatSetBlockSizesFromMats(b->A, B, B));
2906   PetscCall(MatSetType(b->A, MATSEQAIJ));
2907   MatSeqXAIJRestoreOptions_Private(b->A);
2908 
2909   PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
2910   PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
2911   B->preallocated  = PETSC_TRUE;
2912   B->was_assembled = PETSC_FALSE;
2913   B->assembled     = PETSC_FALSE;
2914   PetscFunctionReturn(PETSC_SUCCESS);
2915 }
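
/*
  A minimal preallocation sketch (illustrative) using the public interface that dispatches to the
  routine above; B is assumed to be a MATMPIAIJ matrix whose sizes have already been set:

    // at most 5 nonzeros per row in the diagonal block, 2 in the off-diagonal block
    PetscCall(MatMPIAIJSetPreallocation(B, 5, NULL, 2, NULL));
*/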
2916 
2917 static PetscErrorCode MatResetPreallocation_MPIAIJ(Mat B)
2918 {
2919   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2920   PetscBool   ondiagreset, offdiagreset, memoryreset;
2921 
2922   PetscFunctionBegin;
2923   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
2924   PetscCheck(B->insertmode == NOT_SET_VALUES, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot reset preallocation after setting some values but not yet calling MatAssemblyBegin()/MatAssemblyEnd()");
2925   if (B->num_ass == 0) PetscFunctionReturn(PETSC_SUCCESS);
2926 
2927   PetscCall(MatResetPreallocation_SeqAIJ_Private(b->A, &ondiagreset));
2928   PetscCall(MatResetPreallocation_SeqAIJ_Private(b->B, &offdiagreset));
2929   memoryreset = (PetscBool)(ondiagreset || offdiagreset);
2930   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &memoryreset, 1, MPI_C_BOOL, MPI_LOR, PetscObjectComm((PetscObject)B)));
2931   if (!memoryreset) PetscFunctionReturn(PETSC_SUCCESS);
2932 
2933   PetscCall(PetscLayoutSetUp(B->rmap));
2934   PetscCall(PetscLayoutSetUp(B->cmap));
2935   PetscCheck(B->assembled || B->was_assembled, PetscObjectComm((PetscObject)B), PETSC_ERR_ARG_WRONGSTATE, "Should not need to reset preallocation if the matrix was never assembled");
2936   PetscCall(MatDisAssemble_MPIAIJ(B, PETSC_TRUE));
2937   PetscCall(VecScatterDestroy(&b->Mvctx));
2938 
2939   B->preallocated  = PETSC_TRUE;
2940   B->was_assembled = PETSC_FALSE;
2941   B->assembled     = PETSC_FALSE;
2942   /* Log that the state of this object has changed; this will help guarantee that preconditioners get re-setup */
2943   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2944   PetscFunctionReturn(PETSC_SUCCESS);
2945 }
2946 
2947 PetscErrorCode MatDuplicate_MPIAIJ(Mat matin, MatDuplicateOption cpvalues, Mat *newmat)
2948 {
2949   Mat         mat;
2950   Mat_MPIAIJ *a, *oldmat = (Mat_MPIAIJ *)matin->data;
2951 
2952   PetscFunctionBegin;
2953   *newmat = NULL;
2954   PetscCall(MatCreate(PetscObjectComm((PetscObject)matin), &mat));
2955   PetscCall(MatSetSizes(mat, matin->rmap->n, matin->cmap->n, matin->rmap->N, matin->cmap->N));
2956   PetscCall(MatSetBlockSizesFromMats(mat, matin, matin));
2957   PetscCall(MatSetType(mat, ((PetscObject)matin)->type_name));
2958   a = (Mat_MPIAIJ *)mat->data;
2959 
2960   mat->factortype = matin->factortype;
2961   mat->assembled  = matin->assembled;
2962   mat->insertmode = NOT_SET_VALUES;
2963 
2964   a->size         = oldmat->size;
2965   a->rank         = oldmat->rank;
2966   a->donotstash   = oldmat->donotstash;
2967   a->roworiented  = oldmat->roworiented;
2968   a->rowindices   = NULL;
2969   a->rowvalues    = NULL;
2970   a->getrowactive = PETSC_FALSE;
2971 
2972   PetscCall(PetscLayoutReference(matin->rmap, &mat->rmap));
2973   PetscCall(PetscLayoutReference(matin->cmap, &mat->cmap));
2974   if (matin->hash_active) {
2975     PetscCall(MatSetUp(mat));
2976   } else {
2977     mat->preallocated = matin->preallocated;
2978     if (oldmat->colmap) {
2979 #if defined(PETSC_USE_CTABLE)
2980       PetscCall(PetscHMapIDuplicate(oldmat->colmap, &a->colmap));
2981 #else
2982       PetscCall(PetscMalloc1(mat->cmap->N, &a->colmap));
2983       PetscCall(PetscArraycpy(a->colmap, oldmat->colmap, mat->cmap->N));
2984 #endif
2985     } else a->colmap = NULL;
2986     if (oldmat->garray) {
2987       PetscInt len;
2988       len = oldmat->B->cmap->n;
2989       PetscCall(PetscMalloc1(len, &a->garray));
2990       if (len) PetscCall(PetscArraycpy(a->garray, oldmat->garray, len));
2991     } else a->garray = NULL;
2992 
2993     /* MatDuplicate() may be called with a non-assembled matrix; in fact, it only
2994       requires the matrix to be preallocated. This may happen, for example, inside a
2995       DMCreateMatrix_Shell */
2996     if (oldmat->lvec) PetscCall(VecDuplicate(oldmat->lvec, &a->lvec));
2997     if (oldmat->Mvctx) {
2998       a->Mvctx = oldmat->Mvctx;
2999       PetscCall(PetscObjectReference((PetscObject)oldmat->Mvctx));
3000     }
3001     PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));
3002     PetscCall(MatDuplicate(oldmat->B, cpvalues, &a->B));
3003   }
3004   PetscCall(PetscFunctionListDuplicate(((PetscObject)matin)->qlist, &((PetscObject)mat)->qlist));
3005   *newmat = mat;
3006   PetscFunctionReturn(PETSC_SUCCESS);
3007 }
3008 
3009 PetscErrorCode MatLoad_MPIAIJ(Mat newMat, PetscViewer viewer)
3010 {
3011   PetscBool isbinary, ishdf5;
3012 
3013   PetscFunctionBegin;
3014   PetscValidHeaderSpecific(newMat, MAT_CLASSID, 1);
3015   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
3016   /* force binary viewer to load .info file if it has not yet done so */
3017   PetscCall(PetscViewerSetUp(viewer));
3018   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3019   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERHDF5, &ishdf5));
3020   if (isbinary) {
3021     PetscCall(MatLoad_MPIAIJ_Binary(newMat, viewer));
3022   } else if (ishdf5) {
3023 #if defined(PETSC_HAVE_HDF5)
3024     PetscCall(MatLoad_AIJ_HDF5(newMat, viewer));
3025 #else
3026     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "HDF5 not supported in this build.\nPlease reconfigure using --download-hdf5");
3027 #endif
3028   } else {
3029     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)newMat)->type_name);
3030   }
3031   PetscFunctionReturn(PETSC_SUCCESS);
3032 }
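
/*
  A minimal load sketch (illustrative; the file name is a placeholder): read a MATMPIAIJ matrix
  from a PETSc binary file through the dispatch above:

    Mat         A;
    PetscViewer viewer;

    PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, "matrix.dat", FILE_MODE_READ, &viewer));
    PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
    PetscCall(MatSetType(A, MATMPIAIJ));
    PetscCall(MatLoad(A, viewer));
    PetscCall(PetscViewerDestroy(&viewer));
*/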
3033 
3034 PetscErrorCode MatLoad_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
3035 {
3036   PetscInt     header[4], M, N, m, nz, rows, cols, sum, i;
3037   PetscInt    *rowidxs, *colidxs;
3038   PetscScalar *matvals;
3039 
3040   PetscFunctionBegin;
3041   PetscCall(PetscViewerSetUp(viewer));
3042 
3043   /* read in matrix header */
3044   PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
3045   PetscCheck(header[0] == MAT_FILE_CLASSID, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
3046   M  = header[1];
3047   N  = header[2];
3048   nz = header[3];
3049   PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
3050   PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
3051   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as MPIAIJ");
3052 
3053   /* set block sizes from the viewer's .info file */
3054   PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
3055   /* set global sizes if not set already */
3056   if (mat->rmap->N < 0) mat->rmap->N = M;
3057   if (mat->cmap->N < 0) mat->cmap->N = N;
3058   PetscCall(PetscLayoutSetUp(mat->rmap));
3059   PetscCall(PetscLayoutSetUp(mat->cmap));
3060 
3061   /* check if the matrix sizes are correct */
3062   PetscCall(MatGetSize(mat, &rows, &cols));
3063   PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);
3064 
3065   /* read in row lengths and build row indices */
3066   PetscCall(MatGetLocalSize(mat, &m, NULL));
3067   PetscCall(PetscMalloc1(m + 1, &rowidxs));
3068   PetscCall(PetscViewerBinaryReadAll(viewer, rowidxs + 1, m, PETSC_DECIDE, M, PETSC_INT));
3069   rowidxs[0] = 0;
3070   for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3071   if (nz != PETSC_INT_MAX) {
3072     PetscCallMPI(MPIU_Allreduce(&rowidxs[m], &sum, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)viewer)));
3073     PetscCheck(sum == nz, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);
3074   }
3075 
3076   /* read in column indices and matrix values */
3077   PetscCall(PetscMalloc2(rowidxs[m], &colidxs, rowidxs[m], &matvals));
3078   PetscCall(PetscViewerBinaryReadAll(viewer, colidxs, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
3079   PetscCall(PetscViewerBinaryReadAll(viewer, matvals, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
3080   /* store matrix indices and values */
3081   PetscCall(MatMPIAIJSetPreallocationCSR(mat, rowidxs, colidxs, matvals));
3082   PetscCall(PetscFree(rowidxs));
3083   PetscCall(PetscFree2(colidxs, matvals));
3084   PetscFunctionReturn(PETSC_SUCCESS);
3085 }
3086 
3087 /* Not scalable because of ISAllGather() unless getting all columns. */
3088 static PetscErrorCode ISGetSeqIS_Private(Mat mat, IS iscol, IS *isseq)
3089 {
3090   IS          iscol_local;
3091   PetscBool   isstride;
3092   PetscMPIInt gisstride = 0;
3093 
3094   PetscFunctionBegin;
3095   /* check if we are grabbing all columns */
3096   PetscCall(PetscObjectTypeCompare((PetscObject)iscol, ISSTRIDE, &isstride));
3097 
3098   if (isstride) {
3099     PetscInt start, len, mstart, mlen;
3100     PetscCall(ISStrideGetInfo(iscol, &start, NULL));
3101     PetscCall(ISGetLocalSize(iscol, &len));
3102     PetscCall(MatGetOwnershipRangeColumn(mat, &mstart, &mlen));
3103     if (mstart == start && mlen - mstart == len) gisstride = 1;
3104   }
3105 
3106   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &gisstride, 1, MPI_INT, MPI_MIN, PetscObjectComm((PetscObject)mat)));
3107   if (gisstride) {
3108     PetscInt N;
3109     PetscCall(MatGetSize(mat, NULL, &N));
3110     PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iscol_local));
3111     PetscCall(ISSetIdentity(iscol_local));
3112     PetscCall(PetscInfo(mat, "Optimizing for obtaining all columns of the matrix; skipping ISAllGather()\n"));
3113   } else {
3114     PetscInt cbs;
3115     PetscCall(ISGetBlockSize(iscol, &cbs));
3116     PetscCall(ISAllGather(iscol, &iscol_local));
3117     PetscCall(ISSetBlockSize(iscol_local, cbs));
3118   }
3119 
3120   *isseq = iscol_local;
3121   PetscFunctionReturn(PETSC_SUCCESS);
3122 }
3123 
3124 /*
3125  Used by MatCreateSubMatrix_MPIAIJ_SameRowColDist() to avoid calling ISAllGather() and forming an index set with the global size of iscol_local
3126  (see MatCreateSubMatrix_MPIAIJ_nonscalable)
3127 
3128  Input Parameters:
3129 +   mat - matrix
3130 .   isrow - parallel row index set; its local indices are a subset of the local rows of `mat`,
3131            i.e., mat->rstart <= isrow[i] < mat->rend
3132 -   iscol - parallel column index set; its local indices are a subset of local columns of `mat`,
3133            i.e., mat->cstart <= iscol[i] < mat->cend
3134 
3135  Output Parameters:
3136 +   isrow_d - sequential row index set for retrieving mat->A
3137 .   iscol_d - sequential column index set for retrieving mat->A
3138 .   iscol_o - sequential column index set for retrieving mat->B
3139 -   garray - column map; garray[i] indicates global location of iscol_o[i] in `iscol`
3140  */
3141 static PetscErrorCode ISGetSeqIS_SameColDist_Private(Mat mat, IS isrow, IS iscol, IS *isrow_d, IS *iscol_d, IS *iscol_o, PetscInt *garray[])
3142 {
3143   Vec             x, cmap;
3144   const PetscInt *is_idx;
3145   PetscScalar    *xarray, *cmaparray;
3146   PetscInt        ncols, isstart, *idx, m, rstart, *cmap1, count;
3147   Mat_MPIAIJ     *a    = (Mat_MPIAIJ *)mat->data;
3148   Mat             B    = a->B;
3149   Vec             lvec = a->lvec, lcmap;
3150   PetscInt        i, cstart, cend, Bn = B->cmap->N;
3151   MPI_Comm        comm;
3152   VecScatter      Mvctx = a->Mvctx;
3153 
3154   PetscFunctionBegin;
3155   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3156   PetscCall(ISGetLocalSize(iscol, &ncols));
3157 
3158   /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */
3159   PetscCall(MatCreateVecs(mat, &x, NULL));
3160   PetscCall(VecSet(x, -1.0));
3161   PetscCall(VecDuplicate(x, &cmap));
3162   PetscCall(VecSet(cmap, -1.0));
3163 
3164   /* Get start indices */
3165   PetscCallMPI(MPI_Scan(&ncols, &isstart, 1, MPIU_INT, MPI_SUM, comm));
3166   isstart -= ncols;
3167   PetscCall(MatGetOwnershipRangeColumn(mat, &cstart, &cend));
3168 
3169   PetscCall(ISGetIndices(iscol, &is_idx));
3170   PetscCall(VecGetArray(x, &xarray));
3171   PetscCall(VecGetArray(cmap, &cmaparray));
3172   PetscCall(PetscMalloc1(ncols, &idx));
3173   for (i = 0; i < ncols; i++) {
3174     xarray[is_idx[i] - cstart]    = (PetscScalar)is_idx[i];
3175     cmaparray[is_idx[i] - cstart] = i + isstart;        /* global index of iscol[i] */
3176     idx[i]                        = is_idx[i] - cstart; /* local index of iscol[i]  */
3177   }
3178   PetscCall(VecRestoreArray(x, &xarray));
3179   PetscCall(VecRestoreArray(cmap, &cmaparray));
3180   PetscCall(ISRestoreIndices(iscol, &is_idx));
3181 
3182   /* Get iscol_d */
3183   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, iscol_d));
3184   PetscCall(ISGetBlockSize(iscol, &i));
3185   PetscCall(ISSetBlockSize(*iscol_d, i));
3186 
3187   /* Get isrow_d */
3188   PetscCall(ISGetLocalSize(isrow, &m));
3189   rstart = mat->rmap->rstart;
3190   PetscCall(PetscMalloc1(m, &idx));
3191   PetscCall(ISGetIndices(isrow, &is_idx));
3192   for (i = 0; i < m; i++) idx[i] = is_idx[i] - rstart;
3193   PetscCall(ISRestoreIndices(isrow, &is_idx));
3194 
3195   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, m, idx, PETSC_OWN_POINTER, isrow_d));
3196   PetscCall(ISGetBlockSize(isrow, &i));
3197   PetscCall(ISSetBlockSize(*isrow_d, i));
3198 
3199   /* (2) Scatter x and cmap using aij->Mvctx to get their off-process portions (see MatMult_MPIAIJ) */
3200   PetscCall(VecScatterBegin(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3201   PetscCall(VecScatterEnd(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3202 
3203   PetscCall(VecDuplicate(lvec, &lcmap));
3204 
3205   PetscCall(VecScatterBegin(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3206   PetscCall(VecScatterEnd(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3207 
3208   /* (3) create sequential iscol_o (a subset of iscol) and isgarray */
3209   /* off-process column indices */
3210   count = 0;
3211   PetscCall(PetscMalloc1(Bn, &idx));
3212   PetscCall(PetscMalloc1(Bn, &cmap1));
3213 
3214   PetscCall(VecGetArray(lvec, &xarray));
3215   PetscCall(VecGetArray(lcmap, &cmaparray));
3216   for (i = 0; i < Bn; i++) {
3217     if (PetscRealPart(xarray[i]) > -1.0) {
3218       idx[count]   = i;                                     /* local column index in off-diagonal part B */
3219       cmap1[count] = (PetscInt)PetscRealPart(cmaparray[i]); /* column index in submat */
3220       count++;
3221     }
3222   }
3223   PetscCall(VecRestoreArray(lvec, &xarray));
3224   PetscCall(VecRestoreArray(lcmap, &cmaparray));
3225 
3226   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_COPY_VALUES, iscol_o));
3227   /* cannot ensure iscol_o has same blocksize as iscol! */
3228 
3229   PetscCall(PetscFree(idx));
3230   *garray = cmap1;
3231 
3232   PetscCall(VecDestroy(&x));
3233   PetscCall(VecDestroy(&cmap));
3234   PetscCall(VecDestroy(&lcmap));
3235   PetscFunctionReturn(PETSC_SUCCESS);
3236 }
3237 
3238 /* isrow and iscol have same processor distribution as mat, output *submat is a submatrix of local mat */
3239 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowColDist(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *submat)
3240 {
3241   Mat_MPIAIJ *a = (Mat_MPIAIJ *)mat->data, *asub;
3242   Mat         M = NULL;
3243   MPI_Comm    comm;
3244   IS          iscol_d, isrow_d, iscol_o;
3245   Mat         Asub = NULL, Bsub = NULL;
3246   PetscInt    n, count, M_size, N_size;
3247 
3248   PetscFunctionBegin;
3249   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3250 
3251   if (call == MAT_REUSE_MATRIX) {
3252     /* Retrieve isrow_d, iscol_d and iscol_o from submat */
3253     PetscCall(PetscObjectQuery((PetscObject)*submat, "isrow_d", (PetscObject *)&isrow_d));
3254     PetscCheck(isrow_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "isrow_d passed in was not used before, cannot reuse");
3255 
3256     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_d", (PetscObject *)&iscol_d));
3257     PetscCheck(iscol_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_d passed in was not used before, cannot reuse");
3258 
3259     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_o", (PetscObject *)&iscol_o));
3260     PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse");
3261 
3262     /* Update diagonal and off-diagonal portions of submat */
3263     asub = (Mat_MPIAIJ *)(*submat)->data;
3264     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->A));
3265     PetscCall(ISGetLocalSize(iscol_o, &n));
3266     if (n) PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->B));
3267     PetscCall(MatAssemblyBegin(*submat, MAT_FINAL_ASSEMBLY));
3268     PetscCall(MatAssemblyEnd(*submat, MAT_FINAL_ASSEMBLY));
3269 
3270   } else { /* call == MAT_INITIAL_MATRIX) */
3271     PetscInt *garray, *garray_compact;
3272     PetscInt  BsubN;
3273 
3274     /* Create isrow_d, iscol_d, iscol_o and isgarray (replace isgarray with array?) */
3275     PetscCall(ISGetSeqIS_SameColDist_Private(mat, isrow, iscol, &isrow_d, &iscol_d, &iscol_o, &garray));
3276 
3277     /* Create local submatrices Asub and Bsub */
3278     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Asub));
3279     PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Bsub));
3280 
3281     // Compact garray so it is not of size Bn
3282     PetscCall(ISGetSize(iscol_o, &count));
3283     PetscCall(PetscMalloc1(count, &garray_compact));
3284     PetscCall(PetscArraycpy(garray_compact, garray, count));
3285 
3286     /* Create submatrix M */
3287     PetscCall(ISGetSize(isrow, &M_size));
3288     PetscCall(ISGetSize(iscol, &N_size));
3289     PetscCall(MatCreateMPIAIJWithSeqAIJ(comm, M_size, N_size, Asub, Bsub, garray_compact, &M));
3290 
3291     /* If Bsub has empty columns, compress iscol_o such that it will retrieve condensed Bsub from a->B during reuse */
3292     asub = (Mat_MPIAIJ *)M->data;
3293 
3294     PetscCall(ISGetLocalSize(iscol_o, &BsubN));
3295     n = asub->B->cmap->N;
3296     if (BsubN > n) {
3297       /* This case can be tested using ~petsc/src/tao/bound/tutorials/runplate2_3 */
3298       const PetscInt *idx;
3299       PetscInt        i, j, *idx_new, *subgarray = asub->garray;
3300       PetscCall(PetscInfo(M, "submatrix Bn %" PetscInt_FMT " != BsubN %" PetscInt_FMT ", update iscol_o\n", n, BsubN));
3301 
3302       PetscCall(PetscMalloc1(n, &idx_new));
3303       j = 0;
3304       PetscCall(ISGetIndices(iscol_o, &idx));
3305       for (i = 0; i < n; i++) {
3306         if (j >= BsubN) break;
3307         while (subgarray[i] > garray[j]) j++;
3308 
3309       PetscCheck(subgarray[i] == garray[j], PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "subgarray[%" PetscInt_FMT "]=%" PetscInt_FMT " cannot be less than garray[%" PetscInt_FMT "]=%" PetscInt_FMT, i, subgarray[i], j, garray[j]);
3310         idx_new[i] = idx[j++];
3311       }
3312       PetscCall(ISRestoreIndices(iscol_o, &idx));
3313 
3314       PetscCall(ISDestroy(&iscol_o));
3315       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, idx_new, PETSC_OWN_POINTER, &iscol_o));
3316 
3317     } else PetscCheck(BsubN >= n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Columns of Bsub (%" PetscInt_FMT ") cannot be smaller than B's (%" PetscInt_FMT ")", BsubN, asub->B->cmap->N);
3318 
3319     PetscCall(PetscFree(garray));
3320     *submat = M;
3321 
3322     /* Save isrow_d, iscol_d and iscol_o used in processor for next request */
3323     PetscCall(PetscObjectCompose((PetscObject)M, "isrow_d", (PetscObject)isrow_d));
3324     PetscCall(ISDestroy(&isrow_d));
3325 
3326     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_d", (PetscObject)iscol_d));
3327     PetscCall(ISDestroy(&iscol_d));
3328 
3329     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_o", (PetscObject)iscol_o));
3330     PetscCall(ISDestroy(&iscol_o));
3331   }
3332   PetscFunctionReturn(PETSC_SUCCESS);
3333 }
3334 
3335 PetscErrorCode MatCreateSubMatrix_MPIAIJ(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *newmat)
3336 {
3337   IS        iscol_local = NULL, isrow_d;
3338   PetscInt  csize;
3339   PetscInt  n, i, j, start, end;
3340   PetscBool sameRowDist = PETSC_FALSE, sameDist[2], tsameDist[2];
3341   MPI_Comm  comm;
3342 
3343   PetscFunctionBegin;
3344   /* If isrow has same processor distribution as mat,
3345      call MatCreateSubMatrix_MPIAIJ_SameRowDist() to avoid using a hash table with global size of iscol */
3346   if (call == MAT_REUSE_MATRIX) {
3347     PetscCall(PetscObjectQuery((PetscObject)*newmat, "isrow_d", (PetscObject *)&isrow_d));
3348     if (isrow_d) {
3349       sameRowDist  = PETSC_TRUE;
3350       tsameDist[1] = PETSC_TRUE; /* sameColDist */
3351     } else {
3352       PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_local));
3353       if (iscol_local) {
3354         sameRowDist  = PETSC_TRUE;
3355         tsameDist[1] = PETSC_FALSE; /* !sameColDist */
3356       }
3357     }
3358   } else {
3359     /* Check if isrow has same processor distribution as mat */
3360     sameDist[0] = PETSC_FALSE;
3361     PetscCall(ISGetLocalSize(isrow, &n));
3362     if (!n) {
3363       sameDist[0] = PETSC_TRUE;
3364     } else {
3365       PetscCall(ISGetMinMax(isrow, &i, &j));
3366       PetscCall(MatGetOwnershipRange(mat, &start, &end));
3367       if (i >= start && j < end) sameDist[0] = PETSC_TRUE;
3368     }
3369 
3370     /* Check if iscol has same processor distribution as mat */
3371     sameDist[1] = PETSC_FALSE;
3372     PetscCall(ISGetLocalSize(iscol, &n));
3373     if (!n) {
3374       sameDist[1] = PETSC_TRUE;
3375     } else {
3376       PetscCall(ISGetMinMax(iscol, &i, &j));
3377       PetscCall(MatGetOwnershipRangeColumn(mat, &start, &end));
3378       if (i >= start && j < end) sameDist[1] = PETSC_TRUE;
3379     }
3380 
3381     PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3382     PetscCallMPI(MPIU_Allreduce(&sameDist, &tsameDist, 2, MPI_C_BOOL, MPI_LAND, comm));
3383     sameRowDist = tsameDist[0];
3384   }
3385 
3386   if (sameRowDist) {
3387     if (tsameDist[1]) { /* sameRowDist & sameColDist */
3388       /* isrow and iscol have same processor distribution as mat */
3389       PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowColDist(mat, isrow, iscol, call, newmat));
3390       PetscFunctionReturn(PETSC_SUCCESS);
3391     } else { /* sameRowDist */
3392       /* isrow has same processor distribution as mat */
3393       if (call == MAT_INITIAL_MATRIX) {
3394         PetscBool sorted;
3395         PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3396         PetscCall(ISGetLocalSize(iscol_local, &n)); /* local size of iscol_local = global columns of newmat */
3397         PetscCall(ISGetSize(iscol, &i));
3398         PetscCheck(n == i, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "n %" PetscInt_FMT " != size of iscol %" PetscInt_FMT, n, i);
3399 
3400         PetscCall(ISSorted(iscol_local, &sorted));
3401         if (sorted) {
3402           /* MatCreateSubMatrix_MPIAIJ_SameRowDist() requires iscol_local be sorted; it can have duplicate indices */
3403           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, iscol_local, MAT_INITIAL_MATRIX, newmat));
3404           PetscFunctionReturn(PETSC_SUCCESS);
3405         }
3406       } else { /* call == MAT_REUSE_MATRIX */
3407         IS iscol_sub;
3408         PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3409         if (iscol_sub) {
3410           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, NULL, call, newmat));
3411           PetscFunctionReturn(PETSC_SUCCESS);
3412         }
3413       }
3414     }
3415   }
3416 
3417   /* General case: iscol -> iscol_local which has global size of iscol */
3418   if (call == MAT_REUSE_MATRIX) {
3419     PetscCall(PetscObjectQuery((PetscObject)*newmat, "ISAllGather", (PetscObject *)&iscol_local));
3420     PetscCheck(iscol_local, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3421   } else {
3422     if (!iscol_local) PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3423   }
3424 
3425   PetscCall(ISGetLocalSize(iscol, &csize));
3426   PetscCall(MatCreateSubMatrix_MPIAIJ_nonscalable(mat, isrow, iscol_local, csize, call, newmat));
3427 
3428   if (call == MAT_INITIAL_MATRIX) {
3429     PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3430     PetscCall(ISDestroy(&iscol_local));
3431   }
3432   PetscFunctionReturn(PETSC_SUCCESS);
3433 }
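
/*
  A minimal caller-side sketch (illustrative): extract a parallel submatrix through the public
  MatCreateSubMatrix(), which dispatches to the routine above for MATMPIAIJ; isrow and iscol are
  assumed to be index sets created by the caller:

    Mat sub;

    PetscCall(MatCreateSubMatrix(A, isrow, iscol, MAT_INITIAL_MATRIX, &sub));
    // later, reuse the same index sets and layout
    PetscCall(MatCreateSubMatrix(A, isrow, iscol, MAT_REUSE_MATRIX, &sub));
*/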
3434 
3435 /*@C
3436   MatCreateMPIAIJWithSeqAIJ - creates a `MATMPIAIJ` matrix using `MATSEQAIJ` matrices that contain the "diagonal"
3437   and "off-diagonal" part of the matrix in CSR format.
3438 
3439   Collective
3440 
3441   Input Parameters:
3442 + comm   - MPI communicator
3443 . M      - the global row size
3444 . N      - the global column size
3445 . A      - "diagonal" portion of matrix
3446 . B      - "off-diagonal" portion of the matrix; if `garray` is `NULL`, `B` should use global column ids and have N columns, otherwise `B` should use local column ids and have as many columns as entries in `garray`
3447 - garray - either `NULL` or the global index of `B` columns. If not `NULL`, it should be allocated by `PetscMalloc1()` and will be owned by `mat` thereafter.
3448 
3449   Output Parameter:
3450 . mat - the matrix, with input `A` as its local diagonal matrix
3451 
3452   Level: advanced
3453 
3454   Notes:
3455   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix.
3456 
3457   `A` and `B` become part of the output `mat`; the user must not use `A` and `B` afterwards.
3458 
3459   If `garray` is `NULL`, `B` will be compacted to use local column indices, so `B`'s sparsity pattern (nonzerostate) will change. If `B` is a device matrix, its device copy must also be updated.
3460   We do this by increasing `B`'s nonzerostate; the next use of `B` on the device (see the internal routines `MatSeqAIJCUSPARSECopyToGPU()` or `MatAssemblyEnd_SeqAIJKokkos()`) detects this change and
3461   destroys and then recreates the device copy of `B`. This is not optimal, but it is simple to implement. To avoid this overhead, compute `garray`
3462   yourself; see the algorithms in the private function `MatSetUpMultiply_MPIAIJ()`.
3463 
3464   Whether `garray` is `NULL` does not need to be consistent across processes; `garray` can be `NULL` on some processes and non-`NULL` on others.
3465 
3466 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateMPIAIJWithSplitArrays()`
3467 @*/
3468 PetscErrorCode MatCreateMPIAIJWithSeqAIJ(MPI_Comm comm, PetscInt M, PetscInt N, Mat A, Mat B, PetscInt *garray, Mat *mat)
3469 {
3470   PetscInt    m, n;
3471   MatType     mpi_mat_type;
3472   Mat_MPIAIJ *mpiaij;
3473   Mat         C;
3474 
3475   PetscFunctionBegin;
3476   PetscCall(MatCreate(comm, &C));
3477   PetscCall(MatGetSize(A, &m, &n));
3478   PetscCheck(m == B->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Am %" PetscInt_FMT " != Bm %" PetscInt_FMT, m, B->rmap->N);
3479   PetscCheck(A->rmap->bs == B->rmap->bs, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "A row bs %" PetscInt_FMT " != B row bs %" PetscInt_FMT, A->rmap->bs, B->rmap->bs);
3480 
3481   PetscCall(MatSetSizes(C, m, n, M, N));
3482   /* Determine the type of MPI matrix that should be created from the type of matrix A, which holds the "diagonal" portion. */
3483   PetscCall(MatGetMPIMatType_Private(A, &mpi_mat_type));
3484   PetscCall(MatSetType(C, mpi_mat_type));
3485   if (!garray) {
3486     const PetscScalar *ba;
3487 
3488     B->nonzerostate++;
3489     PetscCall(MatSeqAIJGetArrayRead(B, &ba)); /* Since we will destroy B's device copy, we need to make sure the host copy is up to date */
3490     PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3491   }
3492 
3493   PetscCall(MatSetBlockSizes(C, A->rmap->bs, A->cmap->bs));
3494   PetscCall(PetscLayoutSetUp(C->rmap));
3495   PetscCall(PetscLayoutSetUp(C->cmap));
3496 
3497   mpiaij              = (Mat_MPIAIJ *)C->data;
3498   mpiaij->A           = A;
3499   mpiaij->B           = B;
3500   mpiaij->garray      = garray;
3501   C->preallocated     = PETSC_TRUE;
3502   C->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */
3503 
3504   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
3505   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3506   /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to those of A and B, and
3507    also compacts mpiaij->B (if garray is NULL), reducing its column ids and size
3508    */
3509   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3510   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
3511   PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3512   *mat = C;
3513   PetscFunctionReturn(PETSC_SUCCESS);
3514 }
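
/*
  A minimal usage sketch (illustrative; Adiag and Boff are placeholder names for sequential AIJ
  matrices built by the caller): with garray == NULL, Boff must use global column ids and have N
  columns; it is compacted by the routine above, and both matrices become owned by C:

    Mat C;

    PetscCall(MatCreateMPIAIJWithSeqAIJ(PETSC_COMM_WORLD, M, N, Adiag, Boff, NULL, &C));
    // Adiag and Boff must not be used or destroyed by the caller after this point
*/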
3515 
3516 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, PetscBool, Mat *);
3517 
3518 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowDist(Mat mat, IS isrow, IS iscol, IS iscol_local, MatReuse call, Mat *newmat)
3519 {
3520   PetscInt        i, m, n, rstart, row, rend, nz, j, bs, cbs;
3521   PetscInt       *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3522   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)mat->data;
3523   Mat             M, Msub, B = a->B;
3524   MatScalar      *aa;
3525   Mat_SeqAIJ     *aij;
3526   PetscInt       *garray = a->garray, *colsub, Ncols;
3527   PetscInt        count, Bn = B->cmap->N, cstart = mat->cmap->rstart, cend = mat->cmap->rend;
3528   IS              iscol_sub, iscmap;
3529   const PetscInt *is_idx, *cmap;
3530   PetscBool       allcolumns = PETSC_FALSE;
3531   MPI_Comm        comm;
3532 
3533   PetscFunctionBegin;
3534   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3535   if (call == MAT_REUSE_MATRIX) {
3536     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3537     PetscCheck(iscol_sub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "SubIScol passed in was not used before, cannot reuse");
3538     PetscCall(ISGetLocalSize(iscol_sub, &count));
3539 
3540     PetscCall(PetscObjectQuery((PetscObject)*newmat, "Subcmap", (PetscObject *)&iscmap));
3541     PetscCheck(iscmap, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Subcmap passed in was not used before, cannot reuse");
3542 
3543     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Msub));
3544     PetscCheck(Msub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3545 
3546     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_REUSE_MATRIX, PETSC_FALSE, &Msub));
3547 
3548   } else { /* call == MAT_INITIAL_MATRIX) */
3549     PetscBool flg;
3550 
3551     PetscCall(ISGetLocalSize(iscol, &n));
3552     PetscCall(ISGetSize(iscol, &Ncols));
3553 
3554     /* (1) iscol -> nonscalable iscol_local */
3555     /* Check for special case: each processor gets entire matrix columns */
3556     PetscCall(ISIdentity(iscol_local, &flg));
3557     if (flg && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3558     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3559     if (allcolumns) {
3560       iscol_sub = iscol_local;
3561       PetscCall(PetscObjectReference((PetscObject)iscol_local));
3562       PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iscmap));
3563 
3564     } else {
3565       /* (2) iscol_local -> iscol_sub and iscmap. The implementation below requires iscol_local to be sorted; it can have duplicate indices */
3566       PetscInt *idx, *cmap1, k;
3567       PetscCall(PetscMalloc1(Ncols, &idx));
3568       PetscCall(PetscMalloc1(Ncols, &cmap1));
3569       PetscCall(ISGetIndices(iscol_local, &is_idx));
3570       count = 0;
3571       k     = 0;
3572       for (i = 0; i < Ncols; i++) {
3573         j = is_idx[i];
3574         if (j >= cstart && j < cend) {
3575           /* diagonal part of mat */
3576           idx[count]     = j;
3577           cmap1[count++] = i; /* column index in submat */
3578         } else if (Bn) {
3579           /* off-diagonal part of mat */
3580           if (j == garray[k]) {
3581             idx[count]     = j;
3582             cmap1[count++] = i; /* column index in submat */
3583           } else if (j > garray[k]) {
3584             while (j > garray[k] && k < Bn - 1) k++;
3585             if (j == garray[k]) {
3586               idx[count]     = j;
3587               cmap1[count++] = i; /* column index in submat */
3588             }
3589           }
3590         }
3591       }
3592       PetscCall(ISRestoreIndices(iscol_local, &is_idx));
3593 
3594       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_OWN_POINTER, &iscol_sub));
3595       PetscCall(ISGetBlockSize(iscol, &cbs));
3596       PetscCall(ISSetBlockSize(iscol_sub, cbs));
3597 
3598       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)iscol_local), count, cmap1, PETSC_OWN_POINTER, &iscmap));
3599     }
3600 
3601     /* (3) Create sequential Msub */
3602     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_INITIAL_MATRIX, allcolumns, &Msub));
3603   }
3604 
3605   PetscCall(ISGetLocalSize(iscol_sub, &count));
3606   aij = (Mat_SeqAIJ *)Msub->data;
3607   ii  = aij->i;
3608   PetscCall(ISGetIndices(iscmap, &cmap));
3609 
3610   /*
3611       m - number of local rows
3612       Ncols - number of columns (same on all processors)
3613       rstart - first row in new global matrix generated
3614   */
3615   PetscCall(MatGetSize(Msub, &m, NULL));
3616 
3617   if (call == MAT_INITIAL_MATRIX) {
3618     /* (4) Create parallel newmat */
3619     PetscMPIInt rank, size;
3620     PetscInt    csize;
3621 
3622     PetscCallMPI(MPI_Comm_size(comm, &size));
3623     PetscCallMPI(MPI_Comm_rank(comm, &rank));
3624 
3625     /*
3626         Determine the number of non-zeros in the diagonal and off-diagonal
3627         portions of the matrix in order to do correct preallocation
3628     */
3629 
3630     /* first get start and end of "diagonal" columns */
3631     PetscCall(ISGetLocalSize(iscol, &csize));
3632     if (csize == PETSC_DECIDE) {
3633       PetscCall(ISGetSize(isrow, &mglobal));
3634       if (mglobal == Ncols) { /* square matrix */
3635         nlocal = m;
3636       } else {
3637         nlocal = Ncols / size + ((Ncols % size) > rank);
3638       }
3639     } else {
3640       nlocal = csize;
3641     }
3642     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3643     rstart = rend - nlocal;
3644     PetscCheck(rank != size - 1 || rend == Ncols, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, Ncols);
3645 
3646     /* next, compute all the lengths */
3647     jj = aij->j;
3648     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3649     olens = dlens + m;
3650     for (i = 0; i < m; i++) {
3651       jend = ii[i + 1] - ii[i];
3652       olen = 0;
3653       dlen = 0;
3654       for (j = 0; j < jend; j++) {
3655         if (cmap[*jj] < rstart || cmap[*jj] >= rend) olen++;
3656         else dlen++;
3657         jj++;
3658       }
3659       olens[i] = olen;
3660       dlens[i] = dlen;
3661     }
3662 
3663     PetscCall(ISGetBlockSize(isrow, &bs));
3664     PetscCall(ISGetBlockSize(iscol, &cbs));
3665 
3666     PetscCall(MatCreate(comm, &M));
3667     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, Ncols));
3668     PetscCall(MatSetBlockSizes(M, bs, cbs));
3669     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3670     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3671     PetscCall(PetscFree(dlens));
3672 
3673   } else { /* call == MAT_REUSE_MATRIX */
3674     M = *newmat;
3675     PetscCall(MatGetLocalSize(M, &i, NULL));
3676     PetscCheck(i == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3677     PetscCall(MatZeroEntries(M));
3678     /*
3679          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3680        rather than the slower MatSetValues().
3681     */
3682     M->was_assembled = PETSC_TRUE;
3683     M->assembled     = PETSC_FALSE;
3684   }
3685 
3686   /* (5) Set values of Msub to *newmat */
3687   PetscCall(PetscMalloc1(count, &colsub));
3688   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
3689 
3690   jj = aij->j;
3691   PetscCall(MatSeqAIJGetArrayRead(Msub, (const PetscScalar **)&aa));
3692   for (i = 0; i < m; i++) {
3693     row = rstart + i;
3694     nz  = ii[i + 1] - ii[i];
3695     for (j = 0; j < nz; j++) colsub[j] = cmap[jj[j]];
3696     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, colsub, aa, INSERT_VALUES));
3697     jj += nz;
3698     aa += nz;
3699   }
3700   PetscCall(MatSeqAIJRestoreArrayRead(Msub, (const PetscScalar **)&aa));
3701   PetscCall(ISRestoreIndices(iscmap, &cmap));
3702 
3703   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3704   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3705 
3706   PetscCall(PetscFree(colsub));
3707 
3708   /* save Msub, iscol_sub and iscmap used in processor for next request */
3709   if (call == MAT_INITIAL_MATRIX) {
3710     *newmat = M;
3711     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubMatrix", (PetscObject)Msub));
3712     PetscCall(MatDestroy(&Msub));
3713 
3714     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubIScol", (PetscObject)iscol_sub));
3715     PetscCall(ISDestroy(&iscol_sub));
3716 
3717     PetscCall(PetscObjectCompose((PetscObject)*newmat, "Subcmap", (PetscObject)iscmap));
3718     PetscCall(ISDestroy(&iscmap));
3719 
3720     if (iscol_local) {
3721       PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3722       PetscCall(ISDestroy(&iscol_local));
3723     }
3724   }
3725   PetscFunctionReturn(PETSC_SUCCESS);
3726 }
3727 
3728 /*
3729     Not great since it makes two copies of the submatrix: first a local SeqAIJ on each
3730   process, and then the final result by concatenating the local matrices.
3731   Writing it directly would be much like MatCreateSubMatrices_MPIAIJ()
3732 
3733   This requires a sequential iscol with all indices.
3734 */
3735 PetscErrorCode MatCreateSubMatrix_MPIAIJ_nonscalable(Mat mat, IS isrow, IS iscol, PetscInt csize, MatReuse call, Mat *newmat)
3736 {
3737   PetscMPIInt rank, size;
3738   PetscInt    i, m, n, rstart, row, rend, nz, *cwork, j, bs, cbs;
3739   PetscInt   *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3740   Mat         M, Mreuse;
3741   MatScalar  *aa, *vwork;
3742   MPI_Comm    comm;
3743   Mat_SeqAIJ *aij;
3744   PetscBool   colflag, allcolumns = PETSC_FALSE;
3745 
3746   PetscFunctionBegin;
3747   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3748   PetscCallMPI(MPI_Comm_rank(comm, &rank));
3749   PetscCallMPI(MPI_Comm_size(comm, &size));
3750 
3751   /* Check for special case: each processor gets entire matrix columns */
3752   PetscCall(ISIdentity(iscol, &colflag));
3753   PetscCall(ISGetLocalSize(iscol, &n));
3754   if (colflag && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3755   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPI_C_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3756 
3757   if (call == MAT_REUSE_MATRIX) {
3758     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Mreuse));
3759     PetscCheck(Mreuse, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3760     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_REUSE_MATRIX, allcolumns, &Mreuse));
3761   } else {
3762     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, allcolumns, &Mreuse));
3763   }
3764 
3765   /*
3766       m - number of local rows
3767       n - number of columns (same on all processors)
3768       rstart - first row in new global matrix generated
3769   */
3770   PetscCall(MatGetSize(Mreuse, &m, &n));
3771   PetscCall(MatGetBlockSizes(Mreuse, &bs, &cbs));
3772   if (call == MAT_INITIAL_MATRIX) {
3773     aij = (Mat_SeqAIJ *)Mreuse->data;
3774     ii  = aij->i;
3775     jj  = aij->j;
3776 
3777     /*
3778         Determine the number of non-zeros in the diagonal and off-diagonal
3779         portions of the matrix in order to do correct preallocation
3780     */
3781 
3782     /* first get start and end of "diagonal" columns */
3783     if (csize == PETSC_DECIDE) {
3784       PetscCall(ISGetSize(isrow, &mglobal));
3785       if (mglobal == n) { /* square matrix */
3786         nlocal = m;
3787       } else {
3788         nlocal = n / size + ((n % size) > rank);
3789       }
3790     } else {
3791       nlocal = csize;
3792     }
3793     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3794     rstart = rend - nlocal;
3795     PetscCheck(rank != size - 1 || rend == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, n);
3796 
3797     /* next, compute all the lengths */
3798     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3799     olens = dlens + m;
3800     for (i = 0; i < m; i++) {
3801       jend = ii[i + 1] - ii[i];
3802       olen = 0;
3803       dlen = 0;
3804       for (j = 0; j < jend; j++) {
3805         if (*jj < rstart || *jj >= rend) olen++;
3806         else dlen++;
3807         jj++;
3808       }
3809       olens[i] = olen;
3810       dlens[i] = dlen;
3811     }
3812     PetscCall(MatCreate(comm, &M));
3813     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, n));
3814     PetscCall(MatSetBlockSizes(M, bs, cbs));
3815     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3816     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3817     PetscCall(PetscFree(dlens));
3818   } else {
3819     PetscInt ml, nl;
3820 
3821     M = *newmat;
3822     PetscCall(MatGetLocalSize(M, &ml, &nl));
3823     PetscCheck(ml == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3824     PetscCall(MatZeroEntries(M));
3825     /*
3826          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3827        rather than the slower MatSetValues().
3828     */
3829     M->was_assembled = PETSC_TRUE;
3830     M->assembled     = PETSC_FALSE;
3831   }
3832   PetscCall(MatGetOwnershipRange(M, &rstart, &rend));
3833   aij = (Mat_SeqAIJ *)Mreuse->data;
3834   ii  = aij->i;
3835   jj  = aij->j;
3836 
3837   /* trigger copy to CPU if needed */
3838   PetscCall(MatSeqAIJGetArrayRead(Mreuse, (const PetscScalar **)&aa));
3839   for (i = 0; i < m; i++) {
3840     row   = rstart + i;
3841     nz    = ii[i + 1] - ii[i];
3842     cwork = jj;
3843     jj    = PetscSafePointerPlusOffset(jj, nz);
3844     vwork = aa;
3845     aa    = PetscSafePointerPlusOffset(aa, nz);
3846     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, cwork, vwork, INSERT_VALUES));
3847   }
3848   PetscCall(MatSeqAIJRestoreArrayRead(Mreuse, (const PetscScalar **)&aa));
3849 
3850   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3851   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3852   *newmat = M;
3853 
3854   /* save submatrix used in processor for next request */
3855   if (call == MAT_INITIAL_MATRIX) {
3856     PetscCall(PetscObjectCompose((PetscObject)M, "SubMatrix", (PetscObject)Mreuse));
3857     PetscCall(MatDestroy(&Mreuse));
3858   }
3859   PetscFunctionReturn(PETSC_SUCCESS);
3860 }
3861 
3862 static PetscErrorCode MatMPIAIJSetPreallocationCSR_MPIAIJ(Mat B, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
3863 {
3864   PetscInt        m, cstart, cend, j, nnz, i, d, *ld;
3865   PetscInt       *d_nnz, *o_nnz, nnz_max = 0, rstart, ii, irstart;
3866   const PetscInt *JJ;
3867   PetscBool       nooffprocentries;
3868   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)B->data;
3869 
3870   PetscFunctionBegin;
3871   PetscCall(PetscLayoutSetUp(B->rmap));
3872   PetscCall(PetscLayoutSetUp(B->cmap));
3873   m       = B->rmap->n;
3874   cstart  = B->cmap->rstart;
3875   cend    = B->cmap->rend;
3876   rstart  = B->rmap->rstart;
3877   irstart = Ii[0];
3878 
3879   PetscCall(PetscCalloc2(m, &d_nnz, m, &o_nnz));
3880 
3881   if (PetscDefined(USE_DEBUG)) {
3882     for (i = 0; i < m; i++) {
3883       nnz = Ii[i + 1] - Ii[i];
3884       JJ  = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3885       PetscCheck(nnz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative %" PetscInt_FMT " number of columns", i, nnz);
3886       PetscCheck(!nnz || !(JJ[0] < 0), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " starts with negative column index %" PetscInt_FMT, i, JJ[0]);
3887       PetscCheck(!nnz || !(JJ[nnz - 1] >= B->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " ends with too large a column index %" PetscInt_FMT " (max allowed %" PetscInt_FMT ")", i, JJ[nnz - 1], B->cmap->N);
3888     }
3889   }
3890 
3891   for (i = 0; i < m; i++) {
3892     nnz     = Ii[i + 1] - Ii[i];
3893     JJ      = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3894     nnz_max = PetscMax(nnz_max, nnz);
3895     d       = 0;
3896     for (j = 0; j < nnz; j++) {
3897       if (cstart <= JJ[j] && JJ[j] < cend) d++;
3898     }
3899     d_nnz[i] = d;
3900     o_nnz[i] = nnz - d;
3901   }
3902   PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
3903   PetscCall(PetscFree2(d_nnz, o_nnz));
3904 
3905   for (i = 0; i < m; i++) {
3906     ii = i + rstart;
3907     PetscCall(MatSetValues_MPIAIJ(B, 1, &ii, Ii[i + 1] - Ii[i], PetscSafePointerPlusOffset(J, Ii[i] - irstart), PetscSafePointerPlusOffset(v, Ii[i] - irstart), INSERT_VALUES));
3908   }
3909   nooffprocentries    = B->nooffprocentries;
3910   B->nooffprocentries = PETSC_TRUE;
3911   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
3912   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
3913   B->nooffprocentries = nooffprocentries;
3914 
3915   /* count number of entries below block diagonal */
3916   PetscCall(PetscFree(Aij->ld));
3917   PetscCall(PetscCalloc1(m, &ld));
3918   Aij->ld = ld;
3919   for (i = 0; i < m; i++) {
3920     nnz = Ii[i + 1] - Ii[i];
3921     j   = 0;
3922     while (j < nnz && J[j] < cstart) j++;
3923     ld[i] = j;
3924     if (J) J += nnz;
3925   }
3926 
3927   PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3928   PetscFunctionReturn(PETSC_SUCCESS);
3929 }
3930 
3931 /*@
3932   MatMPIAIJSetPreallocationCSR - Allocates memory for a sparse parallel matrix in `MATAIJ` format
3933   (the default parallel PETSc format).
3934 
3935   Collective
3936 
3937   Input Parameters:
3938 + B - the matrix
3939 . i - the indices into `j` for the start of each local row (indices start with zero)
3940 . j - the column indices for each local row (indices start with zero)
3941 - v - optional values in the matrix
3942 
3943   Level: developer
3944 
3945   Notes:
3946   The `i`, `j`, and `v` arrays ARE copied by this routine into the internal format used by PETSc;
3947   thus you CANNOT change the matrix entries by changing the values of `v` after you have
3948   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
3949 
3950   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
3951   The `i` and `j` indices are 0 based, and the entries of `i` are offsets into the local `j` array.
3952   A convenience routine for this functionality is `MatCreateMPIAIJWithArrays()`.
3953 
3954   You can update the matrix with new numerical values using `MatUpdateMPIAIJWithArrays()` after this call if the column indices in `j` are sorted.
3955   You can update the matrix with new numerical values using `MatUpdateMPIAIJWithArrays()` after this call.
3956 
3957   If you will use `MatUpdateMPIAIJWithArrays()`, the column indices in `j` **must** be sorted; if you will not use it, they do
3958   not need to be sorted.
3959   The format used for the sparse matrix input is equivalent to a
3960   row-major ordering, i.e., for the following matrix, the input data expected is
3961   as shown
3962 .vb
3963         1 0 0
3964         2 0 3     P0
3965        -------
3966         4 5 6     P1
3967 
3968      Process0 [P0] rows_owned=[0,1]
3969         i =  {0,1,3}  [size = nrow+1  = 2+1]
3970         j =  {0,0,2}  [size = 3]
3971         v =  {1,2,3}  [size = 3]
3972 
3973      Process1 [P1] rows_owned=[2]
3974         i =  {0,3}    [size = nrow+1  = 1+1]
3975         j =  {0,1,2}  [size = 3]
3976         v =  {4,5,6}  [size = 3]
3977 .ve
3978 
3979 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatCreateAIJ()`,
3980           `MatCreateSeqAIJWithArrays()`, `MatCreateMPIAIJWithSplitArrays()`, `MatCreateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
3981 @*/
3982 PetscErrorCode MatMPIAIJSetPreallocationCSR(Mat B, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
3983 {
3984   PetscFunctionBegin;
3985   PetscTryMethod(B, "MatMPIAIJSetPreallocationCSR_C", (Mat, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, i, j, v));
3986   PetscFunctionReturn(PETSC_SUCCESS);
3987 }
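
/*
   A minimal usage sketch (an illustration added for exposition, not part of the PETSc manual page above):
   feeding the two-process 3x3 example shown in the .vb block to MatMPIAIJSetPreallocationCSR().
   Assumes exactly two MPI ranks; error checking and cleanup are elided.

     Mat         B;
     PetscMPIInt rank;

     PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
     PetscCall(MatCreate(PETSC_COMM_WORLD, &B));
     PetscCall(MatSetSizes(B, rank == 0 ? 2 : 1, PETSC_DECIDE, 3, 3)); // P0 owns rows 0-1, P1 owns row 2
     PetscCall(MatSetType(B, MATMPIAIJ));
     if (rank == 0) {
       const PetscInt    i[] = {0, 1, 3}, j[] = {0, 0, 2};
       const PetscScalar v[] = {1, 2, 3};
       PetscCall(MatMPIAIJSetPreallocationCSR(B, i, j, v));
     } else {
       const PetscInt    i[] = {0, 3}, j[] = {0, 1, 2};
       const PetscScalar v[] = {4, 5, 6};
       PetscCall(MatMPIAIJSetPreallocationCSR(B, i, j, v));
     }
     // B is preallocated, filled with v, and assembled after this call
*/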
3988 
3989 /*@
3990   MatMPIAIJSetPreallocation - Preallocates memory for a sparse parallel matrix in `MATMPIAIJ` format
3991   (the default parallel PETSc format).  For good matrix assembly performance
3992   the user should preallocate the matrix storage by setting the parameters
3993   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
3994 
3995   Collective
3996 
3997   Input Parameters:
3998 + B     - the matrix
3999 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4000            (same value is used for all local rows)
4001 . d_nnz - array containing the number of nonzeros in the various rows of the
4002            DIAGONAL portion of the local submatrix (possibly different for each row)
4003            or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `d_nz` is used to specify the nonzero structure.
4004            The size of this array is equal to the number of local rows, i.e., 'm'.
4005            For matrices that will be factored, you must leave room for (and set)
4006            the diagonal entry even if it is zero.
4007 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4008            submatrix (same value is used for all local rows).
4009 - o_nnz - array containing the number of nonzeros in the various rows of the
4010            OFF-DIAGONAL portion of the local submatrix (possibly different for
4011            each row) or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `o_nz` is used to specify the nonzero
4012            structure. The size of this array is equal to the number
4013            of local rows, i.e 'm'.
4014            of local rows, i.e., 'm'.
4015   Example Usage:
4016   Consider the following 8x8 matrix with 34 non-zero values, which is
4017   assembled across 3 processors. Let's assume that proc0 owns 3 rows,
4018   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4019   as follows
4020 
4021 .vb
4022             1  2  0  |  0  3  0  |  0  4
4023     Proc0   0  5  6  |  7  0  0  |  8  0
4024             9  0 10  | 11  0  0  | 12  0
4025     -------------------------------------
4026            13  0 14  | 15 16 17  |  0  0
4027     Proc1   0 18  0  | 19 20 21  |  0  0
4028             0  0  0  | 22 23  0  | 24  0
4029     -------------------------------------
4030     Proc2  25 26 27  |  0  0 28  | 29  0
4031            30  0  0  | 31 32 33  |  0 34
4032 .ve
4033 
4034   This can be represented as a collection of submatrices as
4035 .vb
4036       A B C
4037       D E F
4038       G H I
4039 .ve
4040 
4041   Where the submatrices A,B,C are owned by proc0, D,E,F are
4042   owned by proc1, G,H,I are owned by proc2.
4043 
4044   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4045   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4046   The 'M','N' parameters are 8,8, and have the same values on all procs.
4047 
4048   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4049   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4050   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4051   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4052   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4053   matrix, and [DF] as another `MATSEQAIJ` matrix.
4054 
4055   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4056   allocated for every row of the local DIAGONAL submatrix, and `o_nz`
4057   storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
4058   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
4059   the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4060   In this case, the values of `d_nz`, `o_nz` are
4061 .vb
4062      proc0  d_nz = 2, o_nz = 2
4063      proc1  d_nz = 3, o_nz = 2
4064      proc2  d_nz = 1, o_nz = 4
4065 .ve
4066   We are allocating `m`*(`d_nz`+`o_nz`) storage locations for every proc. This
4067   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
4068   for proc2, i.e., we are using 12+15+10=37 storage locations to store
4069   34 values.
4070 
4071   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4072   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4073   In the above case the values for `d_nnz`, `o_nnz` are
4074 .vb
4075      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4076      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4077      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4078 .ve
4079   Here the space allocated is the sum of all the above values, i.e., 34, and
4080   hence the preallocation is perfect.
4081 
4082   Level: intermediate
4083 
4084   Notes:
4085   If the *_nnz parameter is given then the *_nz parameter is ignored
4086 
4087   The `MATAIJ` format, also called compressed row storage (CSR), is compatible with standard Fortran
4088   storage.  The stored row and column indices begin with zero.
4089   See [Sparse Matrices](sec_matsparse) for details.
4090 
4091   The parallel matrix is partitioned such that the first m0 rows belong to
4092   process 0, the next m1 rows belong to process 1, the next m2 rows belong
4093   to process 2, etc., where m0,m1,m2... are the input parameter 'm'.
4094 
4095   The DIAGONAL portion of the local submatrix of a processor can be defined
4096   as the submatrix which is obtained by extracting the part corresponding to
4097   the rows r1-r2 and columns c1-c2 of the global matrix, where r1 is the
4098   first row that belongs to the processor, r2 is the last row belonging to
4099   this processor, and c1-c2 is the range of indices of the local part of a
4100   vector suitable for applying the matrix to.  This is an mxn matrix.  In the
4101   common case of a square matrix, the row and column ranges are the same and
4102   the DIAGONAL part is also square. The remaining portion of the local
4103   submatrix (m x (N-n)) constitutes the OFF-DIAGONAL portion.
4104 
4105   If `o_nnz` and `d_nnz` are specified, then `o_nz` and `d_nz` are ignored.
4106 
4107   You can call `MatGetInfo()` to get information on how effective the preallocation was;
4108   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
4109   You can also run with the option `-info` and look for messages with the string
4110   malloc in them to see if additional memory allocation was needed.
4111 
4112 .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MATMPIAIJ`, `MATAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateAIJ()`, `MatMPIAIJSetPreallocationCSR()`,
4113           `MatGetInfo()`, `PetscSplitOwnership()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4114 @*/
4115 PetscErrorCode MatMPIAIJSetPreallocation(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
4116 {
4117   PetscFunctionBegin;
4118   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
4119   PetscValidType(B, 1);
4120   PetscTryMethod(B, "MatMPIAIJSetPreallocation_C", (Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[]), (B, d_nz, d_nnz, o_nz, o_nnz));
4121   PetscFunctionReturn(PETSC_SUCCESS);
4122 }
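
/*
   A minimal usage sketch (illustrative only, not part of the manual page above): exact per-row
   preallocation of the 8x8 example using the d_nnz/o_nnz arrays listed there. Assumes exactly
   three MPI ranks owning 3, 3, and 2 rows respectively; value insertion and assembly are elided.

     Mat             A;
     PetscMPIInt     rank;
     const PetscInt  d0[] = {2, 2, 2}, o0[] = {2, 2, 2};  // proc0
     const PetscInt  d1[] = {3, 3, 2}, o1[] = {2, 1, 1};  // proc1
     const PetscInt  d2[] = {1, 1},    o2[] = {4, 4};     // proc2
     const PetscInt *dnnz[] = {d0, d1, d2}, *onnz[] = {o0, o1, o2};

     PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
     PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
     PetscCall(MatSetSizes(A, rank == 2 ? 2 : 3, rank == 2 ? 2 : 3, 8, 8));
     PetscCall(MatSetType(A, MATMPIAIJ));
     PetscCall(MatMPIAIJSetPreallocation(A, 0, dnnz[rank], 0, onnz[rank])); // d_nz/o_nz ignored when the arrays are given
*/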
4123 
4124 /*@
4125   MatCreateMPIAIJWithArrays - creates a `MATMPIAIJ` matrix using arrays that contain the local rows in standard
4126   CSR format.
4127 
4128   Collective
4129 
4130   Input Parameters:
4131 + comm - MPI communicator
4132 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
4133 . n    - This value should be the same as the local size used in creating the
4134          x vector for the matrix-vector product $y = Ax$. (or `PETSC_DECIDE` to have
4135          calculated if `N` is given) For square matrices n is almost always `m`.
4136 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
4137 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
4138 . i    - row indices (of length m+1); that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
4139 . j    - global column indices
4140 - a    - optional matrix values
4141 
4142   Output Parameter:
4143 . mat - the matrix
4144 
4145   Level: intermediate
4146 
4147   Notes:
4148   The `i`, `j`, and `a` arrays ARE copied by this routine into the internal format used by PETSc;
4149   thus you CANNOT change the matrix entries by changing the values of `a[]` after you have
4150   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
4151 
4152   The `i` and `j` indices are 0 based, and the entries of `i` are offsets into the local `j` array.
4153 
4154   Once you have created the matrix you can update it with new numerical values using `MatUpdateMPIAIJWithArray()`
4155 
4156   If you do **not** use `MatUpdateMPIAIJWithArray()`, the column indices in `j` do not need to be sorted. If you will use
4157   `MatUpdateMPIAIJWithArray()`, the column indices **must** be sorted.
4158 
4159   The format which is used for the sparse matrix input, is equivalent to a
4160   row-major ordering, i.e., for the following matrix, the input data expected is
4161   as shown
4162 .vb
4163         1 0 0
4164         2 0 3     P0
4165        -------
4166         4 5 6     P1
4167 
4168      Process0 [P0] rows_owned=[0,1]
4169         i =  {0,1,3}  [size = nrow+1  = 2+1]
4170         j =  {0,0,2}  [size = 3]
4171         v =  {1,2,3}  [size = 3]
4172 
4173      Process1 [P1] rows_owned=[2]
4174         i =  {0,3}    [size = nrow+1  = 1+1]
4175         j =  {0,1,2}  [size = 3]
4176         v =  {4,5,6}  [size = 3]
4177 .ve
4178 
4179 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4180           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4181 @*/
4182 PetscErrorCode MatCreateMPIAIJWithArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt i[], const PetscInt j[], const PetscScalar a[], Mat *mat)
4183 {
4184   PetscFunctionBegin;
4185   PetscCheck(!i || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4186   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4187   PetscCall(MatCreate(comm, mat));
4188   PetscCall(MatSetSizes(*mat, m, n, M, N));
4189   /* PetscCall(MatSetBlockSizes(M,bs,cbs)); */
4190   PetscCall(MatSetType(*mat, MATMPIAIJ));
4191   PetscCall(MatMPIAIJSetPreallocationCSR(*mat, i, j, a));
4192   PetscFunctionReturn(PETSC_SUCCESS);
4193 }
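
/*
   A minimal usage sketch (illustrative only): creating the two-process 3x3 example from the manual
   page above in one call with MatCreateMPIAIJWithArrays(). Assumes exactly two MPI ranks; the
   column indices within each row are sorted, so MatUpdateMPIAIJWithArray() may be used later.

     Mat         A;
     PetscMPIInt rank;

     PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
     if (rank == 0) { // P0 owns rows 0-1
       const PetscInt    i[] = {0, 1, 3}, j[] = {0, 0, 2};
       const PetscScalar v[] = {1, 2, 3};
       PetscCall(MatCreateMPIAIJWithArrays(PETSC_COMM_WORLD, 2, PETSC_DECIDE, 3, 3, i, j, v, &A));
     } else {         // P1 owns row 2
       const PetscInt    i[] = {0, 3}, j[] = {0, 1, 2};
       const PetscScalar v[] = {4, 5, 6};
       PetscCall(MatCreateMPIAIJWithArrays(PETSC_COMM_WORLD, 1, PETSC_DECIDE, 3, 3, i, j, v, &A));
     }
*/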
4194 
4195 /*@
4196   MatUpdateMPIAIJWithArrays - updates a `MATMPIAIJ` matrix using arrays that contain the local rows in standard
4197   CSR format. Only the numerical values are updated; the other arrays must be identical to those passed
4198   to `MatCreateMPIAIJWithArrays()`
4199 
4200   Deprecated: Use `MatUpdateMPIAIJWithArray()`
4201 
4202   Collective
4203 
4204   Input Parameters:
4205 + mat - the matrix
4206 . m   - number of local rows (Cannot be `PETSC_DECIDE`)
4207 . n   - This value should be the same as the local size used in creating the
4208        x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4209        calculated if N is given) For square matrices n is almost always m.
4210 . M   - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4211 . N   - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4212 . Ii  - row indices; that is Ii[0] = 0, Ii[row] = Ii[row-1] + number of elements in that row of the matrix
4213 . J   - column indices
4214 - v   - matrix values
4215 
4216   Level: deprecated
4217 
4218 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4219           `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4220 @*/
4221 PetscErrorCode MatUpdateMPIAIJWithArrays(Mat mat, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
4222 {
4223   PetscInt        nnz, i;
4224   PetscBool       nooffprocentries;
4225   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4226   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4227   PetscScalar    *ad, *ao;
4228   PetscInt        ldi, Iii, md;
4229   const PetscInt *Adi = Ad->i;
4230   PetscInt       *ld  = Aij->ld;
4231 
4232   PetscFunctionBegin;
4233   PetscCheck(Ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4234   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4235   PetscCheck(m == mat->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of rows cannot change from call to MatUpdateMPIAIJWithArrays()");
4236   PetscCheck(n == mat->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of columns cannot change from call to MatUpdateMPIAIJWithArrays()");
4237 
4238   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4239   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4240 
4241   for (i = 0; i < m; i++) {
4242     if (PetscDefined(USE_DEBUG)) {
4243       for (PetscInt j = Ii[i] + 1; j < Ii[i + 1]; ++j) {
4244         PetscCheck(J[j] >= J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is not sorted", j - Ii[i], J[j], i);
4245         PetscCheck(J[j] != J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is identical to previous entry", j - Ii[i], J[j], i);
4246       }
4247     }
4248     nnz = Ii[i + 1] - Ii[i];
4249     Iii = Ii[i];
4250     ldi = ld[i];
4251     md  = Adi[i + 1] - Adi[i];
4252     PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4253     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4254     PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4255     ad += md;
4256     ao += nnz - md;
4257   }
4258   nooffprocentries      = mat->nooffprocentries;
4259   mat->nooffprocentries = PETSC_TRUE;
4260   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4261   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4262   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4263   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4264   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4265   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4266   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4267   mat->nooffprocentries = nooffprocentries;
4268   PetscFunctionReturn(PETSC_SUCCESS);
4269 }
4270 
4271 /*@
4272   MatUpdateMPIAIJWithArray - updates a `MATMPIAIJ` matrix using an array that contains the nonzero values
4273 
4274   Collective
4275 
4276   Input Parameters:
4277 + mat - the matrix
4278 - v   - matrix values, stored by row
4279 
4280   Level: intermediate
4281 
4282   Notes:
4283   The matrix must have been obtained with `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()`
4284 
4285   The column indices in the call to `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()` must have been sorted for this call to work correctly
4286 
4287 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4288           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4289 @*/
4290 PetscErrorCode MatUpdateMPIAIJWithArray(Mat mat, const PetscScalar v[])
4291 {
4292   PetscInt        nnz, i, m;
4293   PetscBool       nooffprocentries;
4294   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4295   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4296   Mat_SeqAIJ     *Ao  = (Mat_SeqAIJ *)Aij->B->data;
4297   PetscScalar    *ad, *ao;
4298   const PetscInt *Adi = Ad->i, *Adj = Ao->i;
4299   PetscInt        ldi, Iii, md;
4300   PetscInt       *ld = Aij->ld;
4301 
4302   PetscFunctionBegin;
4303   m = mat->rmap->n;
4304 
4305   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4306   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4307   Iii = 0;
4308   for (i = 0; i < m; i++) {
4309     nnz = Adi[i + 1] - Adi[i] + Adj[i + 1] - Adj[i];
4310     ldi = ld[i];
4311     md  = Adi[i + 1] - Adi[i];
4312     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4313     ad += md;
4314     if (ao) {
4315       PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4316       PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4317       ao += nnz - md;
4318     }
4319     Iii += nnz;
4320   }
4321   nooffprocentries      = mat->nooffprocentries;
4322   mat->nooffprocentries = PETSC_TRUE;
4323   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4324   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4325   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4326   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4327   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4328   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4329   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4330   mat->nooffprocentries = nooffprocentries;
4331   PetscFunctionReturn(PETSC_SUCCESS);
4332 }
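
/*
   A minimal usage sketch (illustrative only): refreshing just the numerical values of the matrix A
   built in the MatCreateMPIAIJWithArrays() sketch earlier in this file (its column indices are
   sorted within each row, as required). Each rank passes its local values in the same CSR order
   that was used at creation time.

     const PetscScalar newv0[] = {10, 20, 30}; // rank 0's replacement values
     const PetscScalar newv1[] = {40, 50, 60}; // rank 1's replacement values
     PetscMPIInt       rank;

     PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
     PetscCall(MatUpdateMPIAIJWithArray(A, rank == 0 ? newv0 : newv1));
*/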
4333 
4334 /*@
4335   MatCreateAIJ - Creates a sparse parallel matrix in `MATAIJ` format
4336   (the default parallel PETSc format).  For good matrix assembly performance
4337   the user should preallocate the matrix storage by setting the parameters
4338   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4339 
4340   Collective
4341 
4342   Input Parameters:
4343 + comm  - MPI communicator
4344 . m     - number of local rows (or `PETSC_DECIDE` to have calculated if M is given)
4345           This value should be the same as the local size used in creating the
4346           y vector for the matrix-vector product y = Ax.
4347 . n     - This value should be the same as the local size used in creating the
4348           x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4349           calculated if N is given) For square matrices n is almost always m.
4350 . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4351 . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4352 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4353           (same value is used for all local rows)
4354 . d_nnz - array containing the number of nonzeros in the various rows of the
4355           DIAGONAL portion of the local submatrix (possibly different for each row)
4356           or `NULL`, if `d_nz` is used to specify the nonzero structure.
4357           The size of this array is equal to the number of local rows, i.e., 'm'.
4358 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4359           submatrix (same value is used for all local rows).
4360 - o_nnz - array containing the number of nonzeros in the various rows of the
4361           OFF-DIAGONAL portion of the local submatrix (possibly different for
4362           each row) or `NULL`, if `o_nz` is used to specify the nonzero
4363           structure. The size of this array is equal to the number
4364           of local rows, i.e., 'm'.
4365 
4366   Output Parameter:
4367 . A - the matrix
4368 
4369   Options Database Keys:
4370 + -mat_no_inode                     - Do not use inodes
4371 . -mat_inode_limit <limit>          - Sets inode limit (max limit=5)
4372 - -matmult_vecscatter_view <viewer> - View the vecscatter (i.e., communication pattern) used in `MatMult()` of sparse parallel matrices.
4373                                       See viewer types in manual of `MatView()`. Of them, ascii_matlab, draw or binary cause the `VecScatter`
4374                                       to be viewed as a matrix. Entry (i,j) is the size of message (in bytes) rank i sends to rank j in one `MatMult()` call.
4375 
4376   Level: intermediate
4377 
4378   Notes:
4379   It is recommended that one use `MatCreateFromOptions()` or the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
4380   MatXXXXSetPreallocation() paradigm instead of this routine directly.
4381   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
4382 
4383   If the *_nnz parameter is given then the *_nz parameter is ignored
4384 
4385   The `m`,`n`,`M`,`N` parameters specify the size of the matrix, and its partitioning across
4386   processors, while `d_nz`,`d_nnz`,`o_nz`,`o_nnz` parameters specify the approximate
4387   storage requirements for this matrix.
4388 
4389   If `PETSC_DECIDE` or  `PETSC_DETERMINE` is used for a particular argument on one
4390   processor then it must be used on all processors that share the object for
4391   that argument.
4392 
4393   If `m` and `n` are not `PETSC_DECIDE`, then the values determine the `PetscLayout` of the matrix and the ranges returned by
4394   `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`, and `MatGetOwnershipRangesColumn()`.
4395 
4396   The user MUST specify either the local or global matrix dimensions
4397   (possibly both).
4398 
4399   The parallel matrix is partitioned across processors such that the
4400   first `m0` rows belong to process 0, the next `m1` rows belong to
4401   process 1, the next `m2` rows belong to process 2, etc., where
4402   `m0`, `m1`, `m2`... are the input parameter `m` on each MPI process. I.e., each MPI process stores
4403   values corresponding to an [m x N] submatrix.
4404 
4405   The columns are logically partitioned with the n0 columns belonging
4406   to the 0th partition, the next n1 columns belonging to the next
4407   partition, etc., where n0,n1,n2... are the input parameter 'n'.
4408 
4409   The DIAGONAL portion of the local submatrix on any given processor
4410   is the submatrix corresponding to the rows and columns m,n
4411   corresponding to the given processor, i.e., the diagonal matrix on
4412   process 0 is [m0 x n0], diagonal matrix on process 1 is [m1 x n1]
4413   etc. The remaining portion of the local submatrix [m x (N-n)]
4414   constitutes the OFF-DIAGONAL portion. The example below better
4415   illustrates this concept. The two matrices, the DIAGONAL portion and
4416   the OFF-DIAGONAL portion are each stored as `MATSEQAIJ` matrices.
4417 
4418   For a square global matrix we define each processor's diagonal portion
4419   to be its local rows and the corresponding columns (a square submatrix);
4420   each processor's off-diagonal portion encompasses the remainder of the
4421   local matrix (a rectangular submatrix).
4422 
4423   If `o_nnz` and `d_nnz` are specified, then `o_nz` and `d_nz` are ignored.
4424 
4425   When calling this routine with a single process communicator, a matrix of
4426   type `MATSEQAIJ` is returned.  If a matrix of type `MATMPIAIJ` is desired for this
4427   type of communicator, use the construction mechanism
4428 .vb
4429   MatCreate(..., &A);
4430   MatSetType(A, MATMPIAIJ);
4431   MatSetSizes(A, m, n, M, N);
4432   MatMPIAIJSetPreallocation(A, ...);
4433 .ve
4434 
4435   By default, this format uses inodes (identical nodes) when possible.
4436   We search for consecutive rows with the same nonzero structure, thereby
4437   reusing matrix information to achieve increased efficiency.
4438 
4439   Example Usage:
4440   Consider the following 8x8 matrix with 34 non-zero values, which is
4441   assembled across 3 processors. Let's assume that proc0 owns 3 rows,
4442   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4443   as follows
4444 
4445 .vb
4446             1  2  0  |  0  3  0  |  0  4
4447     Proc0   0  5  6  |  7  0  0  |  8  0
4448             9  0 10  | 11  0  0  | 12  0
4449     -------------------------------------
4450            13  0 14  | 15 16 17  |  0  0
4451     Proc1   0 18  0  | 19 20 21  |  0  0
4452             0  0  0  | 22 23  0  | 24  0
4453     -------------------------------------
4454     Proc2  25 26 27  |  0  0 28  | 29  0
4455            30  0  0  | 31 32 33  |  0 34
4456 .ve
4457 
4458   This can be represented as a collection of submatrices as
4459 
4460 .vb
4461       A B C
4462       D E F
4463       G H I
4464 .ve
4465 
4466   Where the submatrices A,B,C are owned by proc0, D,E,F are
4467   owned by proc1, G,H,I are owned by proc2.
4468 
4469   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4470   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4471   The 'M','N' parameters are 8,8, and have the same values on all procs.
4472 
4473   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4474   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4475   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4476   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4477   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4478   matrix, and [DF] as another `MATSEQAIJ` matrix.
4479 
4480   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4481   allocated for every row of the local DIAGONAL submatrix, and `o_nz`
4482   storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
4483   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
4484   the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4485   In this case, the values of `d_nz`,`o_nz` are
4486 .vb
4487      proc0  d_nz = 2, o_nz = 2
4488      proc1  d_nz = 3, o_nz = 2
4489      proc2  d_nz = 1, o_nz = 4
4490 .ve
4491   We are allocating m*(`d_nz`+`o_nz`) storage locations for every proc. This
4492   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
4493   for proc2, i.e., we are using 12+15+10=37 storage locations to store
4494   34 values.
4495 
4496   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4497   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4498   In the above case the values for d_nnz,o_nnz are
4499 .vb
4500      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4501      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4502      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4503 .ve
4504   Here the space allocated is the sum of all the above values, i.e., 34, and
4505   hence the preallocation is perfect.
4506 
4507 .seealso: [](ch_matrices), `Mat`, [Sparse Matrix Creation](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4508           `MATMPIAIJ`, `MatCreateMPIAIJWithArrays()`, `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`,
4509           `MatGetOwnershipRangesColumn()`, `PetscLayout`
4510 @*/
4511 PetscErrorCode MatCreateAIJ(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
4512 {
4513   PetscMPIInt size;
4514 
4515   PetscFunctionBegin;
4516   PetscCall(MatCreate(comm, A));
4517   PetscCall(MatSetSizes(*A, m, n, M, N));
4518   PetscCallMPI(MPI_Comm_size(comm, &size));
4519   if (size > 1) {
4520     PetscCall(MatSetType(*A, MATMPIAIJ));
4521     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
4522   } else {
4523     PetscCall(MatSetType(*A, MATSEQAIJ));
4524     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
4525   }
4526   PetscFunctionReturn(PETSC_SUCCESS);
4527 }
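
/*
   A minimal usage sketch (illustrative only): creating the 8x8 example from the manual page above
   with MatCreateAIJ(), using the per-process maxima for d_nz/o_nz listed there. Assumes exactly
   three MPI ranks; value insertion with MatSetValues() and assembly follow as usual.

     Mat         A;
     PetscMPIInt rank;
     PetscInt    m, d_nz, o_nz;

     PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
     m    = rank == 2 ? 2 : 3;                  // proc0 and proc1 own 3 rows, proc2 owns 2
     d_nz = rank == 0 ? 2 : (rank == 1 ? 3 : 1);
     o_nz = rank == 2 ? 4 : 2;
     PetscCall(MatCreateAIJ(PETSC_COMM_WORLD, m, m, 8, 8, d_nz, NULL, o_nz, NULL, &A));
*/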
4528 
4529 /*@C
4530   MatMPIAIJGetSeqAIJ - Returns the local pieces of this distributed matrix
4531 
4532   Not Collective
4533 
4534   Input Parameter:
4535 . A - The `MATMPIAIJ` matrix
4536 
4537   Output Parameters:
4538 + Ad     - The local diagonal block as a `MATSEQAIJ` matrix
4539 . Ao     - The local off-diagonal block as a `MATSEQAIJ` matrix
4540 - colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4541 
4542   Level: intermediate
4543 
4544   Note:
4545   The rows in `Ad` and `Ao` are in [0, Nr), where Nr is the number of local rows on this process. The columns
4546   in `Ad` are in [0, Nc), where Nc is the number of local columns. The columns in `Ao` are in [0, Nco), where Nco is
4547   the number of nonzero columns in the local off-diagonal piece of the matrix `A`. The array colmap maps these
4548   local column numbers to global column numbers in the original matrix.
4549 
4550 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`, `MatCreateAIJ()`, `MATSEQAIJ`
4551 @*/
4552 PetscErrorCode MatMPIAIJGetSeqAIJ(Mat A, Mat *Ad, Mat *Ao, const PetscInt *colmap[])
4553 {
4554   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
4555   PetscBool   flg;
4556 
4557   PetscFunctionBegin;
4558   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &flg));
4559   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "This function requires a MATMPIAIJ matrix as input");
4560   if (Ad) *Ad = a->A;
4561   if (Ao) *Ao = a->B;
4562   if (colmap) *colmap = a->garray;
4563   PetscFunctionReturn(PETSC_SUCCESS);
4564 }
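
/*
   A minimal usage sketch (illustrative only): looking at the diagonal and off-diagonal blocks of a
   MATMPIAIJ matrix A and using colmap to translate the compressed column numbering of the
   off-diagonal block back to global column indices.

     Mat             Ad, Ao;
     const PetscInt *colmap;
     PetscInt        nco;

     PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &colmap));
     PetscCall(MatGetSize(Ao, NULL, &nco)); // number of nonzero off-diagonal columns on this rank
     for (PetscInt c = 0; c < nco; c++) {
       // colmap[c] is the global column index of local column c of Ao
     }
*/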
4565 
4566 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_MPIAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
4567 {
4568   PetscInt     m, N, i, rstart, nnz, Ii;
4569   PetscInt    *indx;
4570   PetscScalar *values;
4571   MatType      rootType;
4572 
4573   PetscFunctionBegin;
4574   PetscCall(MatGetSize(inmat, &m, &N));
4575   if (scall == MAT_INITIAL_MATRIX) { /* symbolic phase */
4576     PetscInt *dnz, *onz, sum, bs, cbs;
4577 
4578     if (n == PETSC_DECIDE) PetscCall(PetscSplitOwnership(comm, &n, &N));
4579     /* Check sum(n) = N */
4580     PetscCallMPI(MPIU_Allreduce(&n, &sum, 1, MPIU_INT, MPI_SUM, comm));
4581     PetscCheck(sum == N, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Sum of local columns %" PetscInt_FMT " != global columns %" PetscInt_FMT, sum, N);
4582 
4583     PetscCallMPI(MPI_Scan(&m, &rstart, 1, MPIU_INT, MPI_SUM, comm));
4584     rstart -= m;
4585 
4586     MatPreallocateBegin(comm, m, n, dnz, onz);
4587     for (i = 0; i < m; i++) {
4588       PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4589       PetscCall(MatPreallocateSet(i + rstart, nnz, indx, dnz, onz));
4590       PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4591     }
4592 
4593     PetscCall(MatCreate(comm, outmat));
4594     PetscCall(MatSetSizes(*outmat, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
4595     PetscCall(MatGetBlockSizes(inmat, &bs, &cbs));
4596     PetscCall(MatSetBlockSizes(*outmat, bs, cbs));
4597     PetscCall(MatGetRootType_Private(inmat, &rootType));
4598     PetscCall(MatSetType(*outmat, rootType));
4599     PetscCall(MatSeqAIJSetPreallocation(*outmat, 0, dnz));
4600     PetscCall(MatMPIAIJSetPreallocation(*outmat, 0, dnz, 0, onz));
4601     MatPreallocateEnd(dnz, onz);
4602     PetscCall(MatSetOption(*outmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
4603   }
4604 
4605   /* numeric phase */
4606   PetscCall(MatGetOwnershipRange(*outmat, &rstart, NULL));
4607   for (i = 0; i < m; i++) {
4608     PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4609     Ii = i + rstart;
4610     PetscCall(MatSetValues(*outmat, 1, &Ii, nnz, indx, values, INSERT_VALUES));
4611     PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4612   }
4613   PetscCall(MatAssemblyBegin(*outmat, MAT_FINAL_ASSEMBLY));
4614   PetscCall(MatAssemblyEnd(*outmat, MAT_FINAL_ASSEMBLY));
4615   PetscFunctionReturn(PETSC_SUCCESS);
4616 }
4617 
4618 static PetscErrorCode MatMergeSeqsToMPIDestroy(void **data)
4619 {
4620   MatMergeSeqsToMPI *merge = (MatMergeSeqsToMPI *)*data;
4621 
4622   PetscFunctionBegin;
4623   if (!merge) PetscFunctionReturn(PETSC_SUCCESS);
4624   PetscCall(PetscFree(merge->id_r));
4625   PetscCall(PetscFree(merge->len_s));
4626   PetscCall(PetscFree(merge->len_r));
4627   PetscCall(PetscFree(merge->bi));
4628   PetscCall(PetscFree(merge->bj));
4629   PetscCall(PetscFree(merge->buf_ri[0]));
4630   PetscCall(PetscFree(merge->buf_ri));
4631   PetscCall(PetscFree(merge->buf_rj[0]));
4632   PetscCall(PetscFree(merge->buf_rj));
4633   PetscCall(PetscFree(merge->coi));
4634   PetscCall(PetscFree(merge->coj));
4635   PetscCall(PetscFree(merge->owners_co));
4636   PetscCall(PetscLayoutDestroy(&merge->rowmap));
4637   PetscCall(PetscFree(merge));
4638   PetscFunctionReturn(PETSC_SUCCESS);
4639 }
4640 
4641 #include <../src/mat/utils/freespace.h>
4642 #include <petscbt.h>
4643 
4644 PetscErrorCode MatCreateMPIAIJSumSeqAIJNumeric(Mat seqmat, Mat mpimat)
4645 {
4646   MPI_Comm           comm;
4647   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)seqmat->data;
4648   PetscMPIInt        size, rank, taga, *len_s;
4649   PetscInt           N = mpimat->cmap->N, i, j, *owners, *ai = a->i, *aj, m;
4650   PetscMPIInt        proc, k;
4651   PetscInt         **buf_ri, **buf_rj;
4652   PetscInt           anzi, *bj_i, *bi, *bj, arow, bnzi, nextaj;
4653   PetscInt           nrows, **buf_ri_k, **nextrow, **nextai;
4654   MPI_Request       *s_waits, *r_waits;
4655   MPI_Status        *status;
4656   const MatScalar   *aa, *a_a;
4657   MatScalar        **abuf_r, *ba_i;
4658   MatMergeSeqsToMPI *merge;
4659   PetscContainer     container;
4660 
4661   PetscFunctionBegin;
4662   PetscCall(PetscObjectGetComm((PetscObject)mpimat, &comm));
4663   PetscCall(PetscLogEventBegin(MAT_Seqstompinum, seqmat, 0, 0, 0));
4664 
4665   PetscCallMPI(MPI_Comm_size(comm, &size));
4666   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4667 
4668   PetscCall(PetscObjectQuery((PetscObject)mpimat, "MatMergeSeqsToMPI", (PetscObject *)&container));
4669   PetscCheck(container, PetscObjectComm((PetscObject)mpimat), PETSC_ERR_PLIB, "Mat not created from MatCreateMPIAIJSumSeqAIJSymbolic");
4670   PetscCall(PetscContainerGetPointer(container, (void **)&merge));
4671   PetscCall(MatSeqAIJGetArrayRead(seqmat, &a_a));
4672   aa = a_a;
4673 
4674   bi     = merge->bi;
4675   bj     = merge->bj;
4676   buf_ri = merge->buf_ri;
4677   buf_rj = merge->buf_rj;
4678 
4679   PetscCall(PetscMalloc1(size, &status));
4680   owners = merge->rowmap->range;
4681   len_s  = merge->len_s;
4682 
4683   /* send and recv matrix values */
4684   PetscCall(PetscObjectGetNewTag((PetscObject)mpimat, &taga));
4685   PetscCall(PetscPostIrecvScalar(comm, taga, merge->nrecv, merge->id_r, merge->len_r, &abuf_r, &r_waits));
4686 
4687   PetscCall(PetscMalloc1(merge->nsend + 1, &s_waits));
4688   for (proc = 0, k = 0; proc < size; proc++) {
4689     if (!len_s[proc]) continue;
4690     i = owners[proc];
4691     PetscCallMPI(MPIU_Isend(aa + ai[i], len_s[proc], MPIU_MATSCALAR, proc, taga, comm, s_waits + k));
4692     k++;
4693   }
4694 
4695   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, r_waits, status));
4696   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, s_waits, status));
4697   PetscCall(PetscFree(status));
4698 
4699   PetscCall(PetscFree(s_waits));
4700   PetscCall(PetscFree(r_waits));
4701 
4702   /* insert mat values of mpimat */
4703   PetscCall(PetscMalloc1(N, &ba_i));
4704   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4705 
4706   for (k = 0; k < merge->nrecv; k++) {
4707     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4708     nrows       = *buf_ri_k[k];
4709     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4710     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4711   }
4712 
4713   /* set values of ba */
4714   m = merge->rowmap->n;
4715   for (i = 0; i < m; i++) {
4716     arow = owners[rank] + i;
4717     bj_i = bj + bi[i]; /* col indices of the i-th row of mpimat */
4718     bnzi = bi[i + 1] - bi[i];
4719     PetscCall(PetscArrayzero(ba_i, bnzi));
4720 
4721     /* add local non-zero vals of this proc's seqmat into ba */
4722     anzi   = ai[arow + 1] - ai[arow];
4723     aj     = a->j + ai[arow];
4724     aa     = a_a + ai[arow];
4725     nextaj = 0;
4726     for (j = 0; nextaj < anzi; j++) {
4727       if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4728         ba_i[j] += aa[nextaj++];
4729       }
4730     }
4731 
4732     /* add received vals into ba */
4733     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
4734       /* i-th row */
4735       if (i == *nextrow[k]) {
4736         anzi   = *(nextai[k] + 1) - *nextai[k];
4737         aj     = buf_rj[k] + *nextai[k];
4738         aa     = abuf_r[k] + *nextai[k];
4739         nextaj = 0;
4740         for (j = 0; nextaj < anzi; j++) {
4741           if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4742             ba_i[j] += aa[nextaj++];
4743           }
4744         }
4745         nextrow[k]++;
4746         nextai[k]++;
4747       }
4748     }
4749     PetscCall(MatSetValues(mpimat, 1, &arow, bnzi, bj_i, ba_i, INSERT_VALUES));
4750   }
4751   PetscCall(MatSeqAIJRestoreArrayRead(seqmat, &a_a));
4752   PetscCall(MatAssemblyBegin(mpimat, MAT_FINAL_ASSEMBLY));
4753   PetscCall(MatAssemblyEnd(mpimat, MAT_FINAL_ASSEMBLY));
4754 
4755   PetscCall(PetscFree(abuf_r[0]));
4756   PetscCall(PetscFree(abuf_r));
4757   PetscCall(PetscFree(ba_i));
4758   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
4759   PetscCall(PetscLogEventEnd(MAT_Seqstompinum, seqmat, 0, 0, 0));
4760   PetscFunctionReturn(PETSC_SUCCESS);
4761 }
4762 
4763 PetscErrorCode MatCreateMPIAIJSumSeqAIJSymbolic(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, Mat *mpimat)
4764 {
4765   Mat                B_mpi;
4766   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)seqmat->data;
4767   PetscMPIInt        size, rank, tagi, tagj, *len_s, *len_si, *len_ri;
4768   PetscInt         **buf_rj, **buf_ri, **buf_ri_k;
4769   PetscInt           M = seqmat->rmap->n, N = seqmat->cmap->n, i, *owners, *ai = a->i, *aj = a->j;
4770   PetscInt           len, *dnz, *onz, bs, cbs;
4771   PetscInt           k, anzi, *bi, *bj, *lnk, nlnk, arow, bnzi;
4772   PetscInt           nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextai;
4773   MPI_Request       *si_waits, *sj_waits, *ri_waits, *rj_waits;
4774   MPI_Status        *status;
4775   PetscFreeSpaceList free_space = NULL, current_space = NULL;
4776   PetscBT            lnkbt;
4777   MatMergeSeqsToMPI *merge;
4778   PetscContainer     container;
4779 
4780   PetscFunctionBegin;
4781   PetscCall(PetscLogEventBegin(MAT_Seqstompisym, seqmat, 0, 0, 0));
4782 
4783   /* make sure it is a PETSc comm */
4784   PetscCall(PetscCommDuplicate(comm, &comm, NULL));
4785   PetscCallMPI(MPI_Comm_size(comm, &size));
4786   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4787 
4788   PetscCall(PetscNew(&merge));
4789   PetscCall(PetscMalloc1(size, &status));
4790 
4791   /* determine row ownership */
4792   PetscCall(PetscLayoutCreate(comm, &merge->rowmap));
4793   PetscCall(PetscLayoutSetLocalSize(merge->rowmap, m));
4794   PetscCall(PetscLayoutSetSize(merge->rowmap, M));
4795   PetscCall(PetscLayoutSetBlockSize(merge->rowmap, 1));
4796   PetscCall(PetscLayoutSetUp(merge->rowmap));
4797   PetscCall(PetscMalloc1(size, &len_si));
4798   PetscCall(PetscMalloc1(size, &merge->len_s));
4799 
4800   m      = merge->rowmap->n;
4801   owners = merge->rowmap->range;
4802 
4803   /* determine the number of messages to send, their lengths */
4804   len_s = merge->len_s;
4805 
4806   len          = 0; /* length of buf_si[] */
4807   merge->nsend = 0;
4808   for (PetscMPIInt proc = 0; proc < size; proc++) {
4809     len_si[proc] = 0;
4810     if (proc == rank) {
4811       len_s[proc] = 0;
4812     } else {
4813       PetscCall(PetscMPIIntCast(owners[proc + 1] - owners[proc] + 1, &len_si[proc]));
4814       PetscCall(PetscMPIIntCast(ai[owners[proc + 1]] - ai[owners[proc]], &len_s[proc])); /* num of rows to be sent to [proc] */
4815     }
4816     if (len_s[proc]) {
4817       merge->nsend++;
4818       nrows = 0;
4819       for (i = owners[proc]; i < owners[proc + 1]; i++) {
4820         if (ai[i + 1] > ai[i]) nrows++;
4821       }
4822       PetscCall(PetscMPIIntCast(2 * (nrows + 1), &len_si[proc]));
4823       len += len_si[proc];
4824     }
4825   }
4826 
4827   /* determine the number and length of messages to receive for ij-structure */
4828   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &merge->nrecv));
4829   PetscCall(PetscGatherMessageLengths2(comm, merge->nsend, merge->nrecv, len_s, len_si, &merge->id_r, &merge->len_r, &len_ri));
4830 
4831   /* post the Irecv of j-structure */
4832   PetscCall(PetscCommGetNewTag(comm, &tagj));
4833   PetscCall(PetscPostIrecvInt(comm, tagj, merge->nrecv, merge->id_r, merge->len_r, &buf_rj, &rj_waits));
4834 
4835   /* post the Isend of j-structure */
4836   PetscCall(PetscMalloc2(merge->nsend, &si_waits, merge->nsend, &sj_waits));
4837 
4838   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4839     if (!len_s[proc]) continue;
4840     i = owners[proc];
4841     PetscCallMPI(MPIU_Isend(aj + ai[i], len_s[proc], MPIU_INT, proc, tagj, comm, sj_waits + k));
4842     k++;
4843   }
4844 
4845   /* receives and sends of j-structure are complete */
4846   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, rj_waits, status));
4847   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, sj_waits, status));
4848 
4849   /* send and recv i-structure */
4850   PetscCall(PetscCommGetNewTag(comm, &tagi));
4851   PetscCall(PetscPostIrecvInt(comm, tagi, merge->nrecv, merge->id_r, len_ri, &buf_ri, &ri_waits));
4852 
4853   PetscCall(PetscMalloc1(len + 1, &buf_s));
4854   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
4855   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4856     if (!len_s[proc]) continue;
4857     /* form outgoing message for i-structure:
4858          buf_si[0]:                 nrows to be sent
4859                [1:nrows]:           row index (global)
4860                [nrows+1:2*nrows+1]: i-structure index
4861     */
4862     nrows       = len_si[proc] / 2 - 1;
4863     buf_si_i    = buf_si + nrows + 1;
4864     buf_si[0]   = nrows;
4865     buf_si_i[0] = 0;
4866     nrows       = 0;
4867     for (i = owners[proc]; i < owners[proc + 1]; i++) {
4868       anzi = ai[i + 1] - ai[i];
4869       if (anzi) {
4870         buf_si_i[nrows + 1] = buf_si_i[nrows] + anzi; /* i-structure */
4871         buf_si[nrows + 1]   = i - owners[proc];       /* local row index */
4872         nrows++;
4873       }
4874     }
4875     PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, si_waits + k));
4876     k++;
4877     buf_si += len_si[proc];
4878   }
4879 
4880   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, ri_waits, status));
4881   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, si_waits, status));
4882 
4883   PetscCall(PetscInfo(seqmat, "nsend: %d, nrecv: %d\n", merge->nsend, merge->nrecv));
4884   for (i = 0; i < merge->nrecv; i++) PetscCall(PetscInfo(seqmat, "recv len_ri=%d, len_rj=%d from [%d]\n", len_ri[i], merge->len_r[i], merge->id_r[i]));
4885 
4886   PetscCall(PetscFree(len_si));
4887   PetscCall(PetscFree(len_ri));
4888   PetscCall(PetscFree(rj_waits));
4889   PetscCall(PetscFree2(si_waits, sj_waits));
4890   PetscCall(PetscFree(ri_waits));
4891   PetscCall(PetscFree(buf_s));
4892   PetscCall(PetscFree(status));
4893 
4894   /* compute a local seq matrix in each processor */
4895   /* allocate bi array and free space for accumulating nonzero column info */
4896   PetscCall(PetscMalloc1(m + 1, &bi));
4897   bi[0] = 0;
4898 
4899   /* create and initialize a linked list */
4900   nlnk = N + 1;
4901   PetscCall(PetscLLCreate(N, N, nlnk, lnk, lnkbt));
4902 
4903   /* initial FreeSpace size is 2*(num of local nnz(seqmat)) */
4904   len = ai[owners[rank + 1]] - ai[owners[rank]];
4905   PetscCall(PetscFreeSpaceGet(PetscIntMultTruncate(2, len) + 1, &free_space));
4906 
4907   current_space = free_space;
4908 
4909   /* determine symbolic info for each local row */
4910   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4911 
4912   for (k = 0; k < merge->nrecv; k++) {
4913     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4914     nrows       = *buf_ri_k[k];
4915     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4916     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4917   }
4918 
4919   MatPreallocateBegin(comm, m, n, dnz, onz);
4920   len = 0;
4921   for (i = 0; i < m; i++) {
4922     bnzi = 0;
4923     /* add local non-zero cols of this proc's seqmat into lnk */
4924     arow = owners[rank] + i;
4925     anzi = ai[arow + 1] - ai[arow];
4926     aj   = a->j + ai[arow];
4927     PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
4928     bnzi += nlnk;
4929     /* add received col data into lnk */
4930     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
4931       if (i == *nextrow[k]) {            /* i-th row */
4932         anzi = *(nextai[k] + 1) - *nextai[k];
4933         aj   = buf_rj[k] + *nextai[k];
4934         PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
4935         bnzi += nlnk;
4936         nextrow[k]++;
4937         nextai[k]++;
4938       }
4939     }
4940     if (len < bnzi) len = bnzi; /* =max(bnzi) */
4941 
4942     /* if free space is not available, make more free space */
4943     if (current_space->local_remaining < bnzi) PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(bnzi, current_space->total_array_size), &current_space));
4944     /* copy data into free space, then initialize lnk */
4945     PetscCall(PetscLLClean(N, N, bnzi, lnk, current_space->array, lnkbt));
4946     PetscCall(MatPreallocateSet(i + owners[rank], bnzi, current_space->array, dnz, onz));
4947 
4948     current_space->array += bnzi;
4949     current_space->local_used += bnzi;
4950     current_space->local_remaining -= bnzi;
4951 
4952     bi[i + 1] = bi[i] + bnzi;
4953   }
4954 
4955   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
4956 
4957   PetscCall(PetscMalloc1(bi[m], &bj));
4958   PetscCall(PetscFreeSpaceContiguous(&free_space, bj));
4959   PetscCall(PetscLLDestroy(lnk, lnkbt));
4960 
4961   /* create symbolic parallel matrix B_mpi */
4962   PetscCall(MatGetBlockSizes(seqmat, &bs, &cbs));
4963   PetscCall(MatCreate(comm, &B_mpi));
4964   if (n == PETSC_DECIDE) {
4965     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, N));
4966   } else {
4967     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
4968   }
4969   PetscCall(MatSetBlockSizes(B_mpi, bs, cbs));
4970   PetscCall(MatSetType(B_mpi, MATMPIAIJ));
4971   PetscCall(MatMPIAIJSetPreallocation(B_mpi, 0, dnz, 0, onz));
4972   MatPreallocateEnd(dnz, onz);
4973   PetscCall(MatSetOption(B_mpi, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
4974 
4975   /* B_mpi is not ready for use - assembly will be done by MatCreateMPIAIJSumSeqAIJNumeric() */
4976   B_mpi->assembled = PETSC_FALSE;
4977   merge->bi        = bi;
4978   merge->bj        = bj;
4979   merge->buf_ri    = buf_ri;
4980   merge->buf_rj    = buf_rj;
4981   merge->coi       = NULL;
4982   merge->coj       = NULL;
4983   merge->owners_co = NULL;
4984 
4985   PetscCall(PetscCommDestroy(&comm));
4986 
4987   /* attach the supporting struct to B_mpi for reuse */
4988   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
4989   PetscCall(PetscContainerSetPointer(container, merge));
4990   PetscCall(PetscContainerSetCtxDestroy(container, MatMergeSeqsToMPIDestroy));
4991   PetscCall(PetscObjectCompose((PetscObject)B_mpi, "MatMergeSeqsToMPI", (PetscObject)container));
4992   PetscCall(PetscContainerDestroy(&container));
4993   *mpimat = B_mpi;
4994 
4995   PetscCall(PetscLogEventEnd(MAT_Seqstompisym, seqmat, 0, 0, 0));
4996   PetscFunctionReturn(PETSC_SUCCESS);
4997 }
4998 
4999 /*@
5000   MatCreateMPIAIJSumSeqAIJ - Creates a `MATMPIAIJ` matrix by adding sequential
5001   matrices from each processor
5002 
5003   Collective
5004 
5005   Input Parameters:
5006 + comm   - the communicator the parallel matrix will live on
5007 . seqmat - the input sequential matrix on each MPI process
5008 . m      - number of local rows (or `PETSC_DECIDE`)
5009 . n      - number of local columns (or `PETSC_DECIDE`)
5010 - scall  - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5011 
5012   Output Parameter:
5013 . mpimat - the parallel matrix generated
5014 
5015   Level: advanced
5016 
5017   Note:
5018   The dimensions of the sequential matrix on each MPI process MUST be the same.
5019   The input `seqmat` is included in the container "MatMergeSeqsToMPI" composed with `mpimat`, and will be
5020   destroyed when `mpimat` is destroyed. Call `PetscObjectQuery()` to access `seqmat`.
5021 
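  Example Usage:
  A minimal sketch (not taken from the PETSc examples); it assumes every MPI process has already assembled a `MATSEQAIJ` matrix `seqmat` of identical size.
.vb
  Mat seqmat, mpimat;
  // ... assemble seqmat on every process ...
  PetscCall(MatCreateMPIAIJSumSeqAIJ(PETSC_COMM_WORLD, seqmat, PETSC_DECIDE, PETSC_DECIDE, MAT_INITIAL_MATRIX, &mpimat));
  // if only the values of seqmat change (same nonzero pattern), the sum can be recomputed in place
  PetscCall(MatCreateMPIAIJSumSeqAIJ(PETSC_COMM_WORLD, seqmat, PETSC_DECIDE, PETSC_DECIDE, MAT_REUSE_MATRIX, &mpimat));
.ve
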
5022 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJ()`
5023 @*/
5024 PetscErrorCode MatCreateMPIAIJSumSeqAIJ(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, MatReuse scall, Mat *mpimat)
5025 {
5026   PetscMPIInt size;
5027 
5028   PetscFunctionBegin;
5029   PetscCallMPI(MPI_Comm_size(comm, &size));
5030   if (size == 1) {
5031     PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5032     if (scall == MAT_INITIAL_MATRIX) {
5033       PetscCall(MatDuplicate(seqmat, MAT_COPY_VALUES, mpimat));
5034     } else {
5035       PetscCall(MatCopy(seqmat, *mpimat, SAME_NONZERO_PATTERN));
5036     }
5037     PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5038     PetscFunctionReturn(PETSC_SUCCESS);
5039   }
5040   PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5041   if (scall == MAT_INITIAL_MATRIX) PetscCall(MatCreateMPIAIJSumSeqAIJSymbolic(comm, seqmat, m, n, mpimat));
5042   PetscCall(MatCreateMPIAIJSumSeqAIJNumeric(seqmat, *mpimat));
5043   PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5044   PetscFunctionReturn(PETSC_SUCCESS);
5045 }
5046 
5047 /*@
5048   MatAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATAIJ` matrix.
5049 
5050   Not Collective
5051 
5052   Input Parameter:
5053 . A - the matrix
5054 
5055   Output Parameter:
5056 . A_loc - the local sequential matrix generated
5057 
5058   Level: developer
5059 
5060   Notes:
5061   The matrix is created by taking `A`'s local rows and putting them into a sequential matrix
5062   with `mlocal` rows and `n` columns, where `mlocal` is the local row count obtained with `MatGetLocalSize()` and
5063   `n` is the global column count obtained with `MatGetSize()`.
5064 
5065   In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5066 
5067   For parallel matrices this creates an entirely new matrix. If the matrix is sequential it merely increases the reference count.
5068 
5069   Destroy the matrix with `MatDestroy()`
5070 
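  Example Usage:
  A minimal sketch (not taken from the PETSc examples):
.vb
  Mat A_loc;
  PetscCall(MatAIJGetLocalMat(A, &A_loc));
  // ... use the sequential matrix A_loc ...
  PetscCall(MatDestroy(&A_loc));
.ve
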
5071 .seealso: [](ch_matrices), `Mat`, `MatMPIAIJGetLocalMat()`
5072 @*/
5073 PetscErrorCode MatAIJGetLocalMat(Mat A, Mat *A_loc)
5074 {
5075   PetscBool mpi;
5076 
5077   PetscFunctionBegin;
5078   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &mpi));
5079   if (mpi) {
5080     PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, A_loc));
5081   } else {
5082     *A_loc = A;
5083     PetscCall(PetscObjectReference((PetscObject)*A_loc));
5084   }
5085   PetscFunctionReturn(PETSC_SUCCESS);
5086 }
5087 
5088 /*@
5089   MatMPIAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix.
5090 
5091   Not Collective
5092 
5093   Input Parameters:
5094 + A     - the matrix
5095 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5096 
5097   Output Parameter:
5098 . A_loc - the local sequential matrix generated
5099 
5100   Level: developer
5101 
5102   Notes:
5103   The matrix is created by taking all of `A`'s local rows and putting them into a sequential
5104   matrix with `mlocal` rows and `n` columns. `mlocal` is the local row count obtained with
5105   `MatGetLocalSize()` and `n` is the global column count obtained with `MatGetSize()`.
5106 
5107   In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5108 
5109   When `A` is sequential and `MAT_INITIAL_MATRIX` is requested, the matrix returned is the diagonal part of `A` (which contains the entire matrix),
5110   with its reference count increased by one. Hence changing values of `A_loc` changes `A`. If `MAT_REUSE_MATRIX` is requested on a sequential matrix
5111   then `MatCopy`(Adiag,*`A_loc`,`SAME_NONZERO_PATTERN`) is called to fill `A_loc`. Thus one can preallocate the appropriate sequential matrix `A_loc`
5112   and then call this routine with `MAT_REUSE_MATRIX`. In this case, one can modify the values of `A_loc` without affecting the original sequential matrix.
5113 
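  Example Usage:
  A minimal sketch (not taken from the PETSc examples):
.vb
  Mat A_loc;
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &A_loc));
  // ... A's values change, but its nonzero pattern does not ...
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &A_loc));
  PetscCall(MatDestroy(&A_loc));
.ve
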
5114 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMatCondensed()`, `MatMPIAIJGetLocalMatMerge()`
5115 @*/
5116 PetscErrorCode MatMPIAIJGetLocalMat(Mat A, MatReuse scall, Mat *A_loc)
5117 {
5118   Mat_MPIAIJ        *mpimat = (Mat_MPIAIJ *)A->data;
5119   Mat_SeqAIJ        *mat, *a, *b;
5120   PetscInt          *ai, *aj, *bi, *bj, *cmap = mpimat->garray;
5121   const PetscScalar *aa, *ba, *aav, *bav;
5122   PetscScalar       *ca, *cam;
5123   PetscMPIInt        size;
5124   PetscInt           am = A->rmap->n, i, j, k, cstart = A->cmap->rstart;
5125   PetscInt          *ci, *cj, col, ncols_d, ncols_o, jo;
5126   PetscBool          match;
5127 
5128   PetscFunctionBegin;
5129   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &match));
5130   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5131   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5132   if (size == 1) {
5133     if (scall == MAT_INITIAL_MATRIX) {
5134       PetscCall(PetscObjectReference((PetscObject)mpimat->A));
5135       *A_loc = mpimat->A;
5136     } else if (scall == MAT_REUSE_MATRIX) {
5137       PetscCall(MatCopy(mpimat->A, *A_loc, SAME_NONZERO_PATTERN));
5138     }
5139     PetscFunctionReturn(PETSC_SUCCESS);
5140   }
5141 
5142   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5143   a  = (Mat_SeqAIJ *)mpimat->A->data;
5144   b  = (Mat_SeqAIJ *)mpimat->B->data;
5145   ai = a->i;
5146   aj = a->j;
5147   bi = b->i;
5148   bj = b->j;
5149   PetscCall(MatSeqAIJGetArrayRead(mpimat->A, &aav));
5150   PetscCall(MatSeqAIJGetArrayRead(mpimat->B, &bav));
5151   aa = aav;
5152   ba = bav;
5153   if (scall == MAT_INITIAL_MATRIX) {
5154     PetscCall(PetscMalloc1(1 + am, &ci));
5155     ci[0] = 0;
5156     for (i = 0; i < am; i++) ci[i + 1] = ci[i] + (ai[i + 1] - ai[i]) + (bi[i + 1] - bi[i]);
5157     PetscCall(PetscMalloc1(1 + ci[am], &cj));
5158     PetscCall(PetscMalloc1(1 + ci[am], &ca));
5159     k = 0;
5160     for (i = 0; i < am; i++) {
5161       ncols_o = bi[i + 1] - bi[i];
5162       ncols_d = ai[i + 1] - ai[i];
5163       /* off-diagonal portion of A */
5164       for (jo = 0; jo < ncols_o; jo++) {
5165         col = cmap[*bj];
5166         if (col >= cstart) break;
5167         cj[k] = col;
5168         bj++;
5169         ca[k++] = *ba++;
5170       }
5171       /* diagonal portion of A */
5172       for (j = 0; j < ncols_d; j++) {
5173         cj[k]   = cstart + *aj++;
5174         ca[k++] = *aa++;
5175       }
5176       /* off-diagonal portion of A */
5177       for (j = jo; j < ncols_o; j++) {
5178         cj[k]   = cmap[*bj++];
5179         ca[k++] = *ba++;
5180       }
5181     }
5182     /* put together the new matrix */
5183     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, A->cmap->N, ci, cj, ca, A_loc));
5184     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5185     /* Since these are PETSc arrays, change flags to free them as necessary. */
5186     mat          = (Mat_SeqAIJ *)(*A_loc)->data;
5187     mat->free_a  = PETSC_TRUE;
5188     mat->free_ij = PETSC_TRUE;
5189     mat->nonew   = 0;
5190   } else if (scall == MAT_REUSE_MATRIX) {
5191     mat = (Mat_SeqAIJ *)(*A_loc)->data;
5192     ci  = mat->i;
5193     cj  = mat->j;
5194     PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &cam));
5195     for (i = 0; i < am; i++) {
5196       /* off-diagonal portion of A */
5197       ncols_o = bi[i + 1] - bi[i];
5198       for (jo = 0; jo < ncols_o; jo++) {
5199         col = cmap[*bj];
5200         if (col >= cstart) break;
5201         *cam++ = *ba++;
5202         bj++;
5203       }
5204       /* diagonal portion of A */
5205       ncols_d = ai[i + 1] - ai[i];
5206       for (j = 0; j < ncols_d; j++) *cam++ = *aa++;
5207       /* off-diagonal portion of A */
5208       for (j = jo; j < ncols_o; j++) {
5209         *cam++ = *ba++;
5210         bj++;
5211       }
5212     }
5213     PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &cam));
5214   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5215   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->A, &aav));
5216   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->B, &bav));
5217   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5218   PetscFunctionReturn(PETSC_SUCCESS);
5219 }
5220 
5221 /*@
5222   MatMPIAIJGetLocalMatMerge - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix by taking all its local rows and putting them into a sequential matrix with
5223   `mlocal` rows and `n` columns, where `n` is the sum of the number of columns of the diagonal and off-diagonal parts
5224 
5225   Not Collective
5226 
5227   Input Parameters:
5228 + A     - the matrix
5229 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5230 
5231   Output Parameters:
5232 + glob  - sequential `IS` with global indices associated with the columns of the local sequential matrix generated (can be `NULL`)
5233 - A_loc - the local sequential matrix generated
5234 
5235   Level: developer
5236 
5237   Note:
5238   This is different from `MatMPIAIJGetLocalMat()` since the first columns in the returned matrix are those associated with the diagonal
5239   part, followed by those associated with the off-diagonal part (in its local ordering).
5240 
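  Example Usage:
  A minimal sketch (not taken from the PETSc examples):
.vb
  Mat A_loc;
  IS  glob;
  PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &A_loc));
  // glob[i] is the global column index of local column i of A_loc
  PetscCall(ISDestroy(&glob));
  PetscCall(MatDestroy(&A_loc));
.ve
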
5241 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`
5242 @*/
5243 PetscErrorCode MatMPIAIJGetLocalMatMerge(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
5244 {
5245   Mat             Ao, Ad;
5246   const PetscInt *cmap;
5247   PetscMPIInt     size;
5248   PetscErrorCode (*f)(Mat, MatReuse, IS *, Mat *);
5249 
5250   PetscFunctionBegin;
5251   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
5252   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5253   if (size == 1) {
5254     if (scall == MAT_INITIAL_MATRIX) {
5255       PetscCall(PetscObjectReference((PetscObject)Ad));
5256       *A_loc = Ad;
5257     } else if (scall == MAT_REUSE_MATRIX) {
5258       PetscCall(MatCopy(Ad, *A_loc, SAME_NONZERO_PATTERN));
5259     }
5260     if (glob) PetscCall(ISCreateStride(PetscObjectComm((PetscObject)Ad), Ad->cmap->n, Ad->cmap->rstart, 1, glob));
5261     PetscFunctionReturn(PETSC_SUCCESS);
5262   }
5263   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", &f));
5264   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5265   if (f) {
5266     PetscCall((*f)(A, scall, glob, A_loc));
5267   } else {
5268     Mat_SeqAIJ        *a = (Mat_SeqAIJ *)Ad->data;
5269     Mat_SeqAIJ        *b = (Mat_SeqAIJ *)Ao->data;
5270     Mat_SeqAIJ        *c;
5271     PetscInt          *ai = a->i, *aj = a->j;
5272     PetscInt          *bi = b->i, *bj = b->j;
5273     PetscInt          *ci, *cj;
5274     const PetscScalar *aa, *ba;
5275     PetscScalar       *ca;
5276     PetscInt           i, j, am, dn, on;
5277 
5278     PetscCall(MatGetLocalSize(Ad, &am, &dn));
5279     PetscCall(MatGetLocalSize(Ao, NULL, &on));
5280     PetscCall(MatSeqAIJGetArrayRead(Ad, &aa));
5281     PetscCall(MatSeqAIJGetArrayRead(Ao, &ba));
5282     if (scall == MAT_INITIAL_MATRIX) {
5283       PetscInt k;
5284       PetscCall(PetscMalloc1(1 + am, &ci));
5285       PetscCall(PetscMalloc1(ai[am] + bi[am], &cj));
5286       PetscCall(PetscMalloc1(ai[am] + bi[am], &ca));
5287       ci[0] = 0;
5288       for (i = 0, k = 0; i < am; i++) {
5289         const PetscInt ncols_o = bi[i + 1] - bi[i];
5290         const PetscInt ncols_d = ai[i + 1] - ai[i];
5291         ci[i + 1]              = ci[i] + ncols_o + ncols_d;
5292         /* diagonal portion of A */
5293         for (j = 0; j < ncols_d; j++, k++) {
5294           cj[k] = *aj++;
5295           ca[k] = *aa++;
5296         }
5297         /* off-diagonal portion of A */
5298         for (j = 0; j < ncols_o; j++, k++) {
5299           cj[k] = dn + *bj++;
5300           ca[k] = *ba++;
5301         }
5302       }
5303       /* put together the new matrix */
5304       PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, dn + on, ci, cj, ca, A_loc));
5305       /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5306       /* Since these are PETSc arrays, change flags to free them as necessary. */
5307       c          = (Mat_SeqAIJ *)(*A_loc)->data;
5308       c->free_a  = PETSC_TRUE;
5309       c->free_ij = PETSC_TRUE;
5310       c->nonew   = 0;
5311       PetscCall(MatSetType(*A_loc, ((PetscObject)Ad)->type_name));
5312     } else if (scall == MAT_REUSE_MATRIX) {
5313       PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &ca));
5314       for (i = 0; i < am; i++) {
5315         const PetscInt ncols_d = ai[i + 1] - ai[i];
5316         const PetscInt ncols_o = bi[i + 1] - bi[i];
5317         /* diagonal portion of A */
5318         for (j = 0; j < ncols_d; j++) *ca++ = *aa++;
5319         /* off-diagonal portion of A */
5320         for (j = 0; j < ncols_o; j++) *ca++ = *ba++;
5321       }
5322       PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &ca));
5323     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5324     PetscCall(MatSeqAIJRestoreArrayRead(Ad, &aa));
5325     PetscCall(MatSeqAIJRestoreArrayRead(Ao, &ba));
5326     if (glob) {
5327       PetscInt cst, *gidx;
5328 
5329       PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
5330       PetscCall(PetscMalloc1(dn + on, &gidx));
5331       for (i = 0; i < dn; i++) gidx[i] = cst + i;
5332       for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
5333       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
5334     }
5335   }
5336   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5337   PetscFunctionReturn(PETSC_SUCCESS);
5338 }
5339 
5340 /*@C
5341   MatMPIAIJGetLocalMatCondensed - Creates a `MATSEQAIJ` matrix from an `MATMPIAIJ` matrix by taking all its local rows and NON-ZERO columns
5342 
5343   Not Collective
5344 
5345   Input Parameters:
5346 + A     - the matrix
5347 . scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5348 . row   - index set of rows to extract (or `NULL`)
5349 - col   - index set of columns to extract (or `NULL`)
5350 
5351   Output Parameter:
5352 . A_loc - the local sequential matrix generated
5353 
5354   Level: developer
5355 
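  Example Usage:
  A minimal sketch (not taken from the PETSc examples); passing `NULL` for `row` and `col` extracts all local rows and the nonzero columns:
.vb
  Mat A_loc;
  PetscCall(MatMPIAIJGetLocalMatCondensed(A, MAT_INITIAL_MATRIX, NULL, NULL, &A_loc));
  // ... use A_loc ...
  PetscCall(MatDestroy(&A_loc));
.ve
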
5356 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`
5357 @*/
5358 PetscErrorCode MatMPIAIJGetLocalMatCondensed(Mat A, MatReuse scall, IS *row, IS *col, Mat *A_loc)
5359 {
5360   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5361   PetscInt    i, start, end, ncols, nzA, nzB, *cmap, imark, *idx;
5362   IS          isrowa, iscola;
5363   Mat        *aloc;
5364   PetscBool   match;
5365 
5366   PetscFunctionBegin;
5367   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &match));
5368   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5369   PetscCall(PetscLogEventBegin(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5370   if (!row) {
5371     start = A->rmap->rstart;
5372     end   = A->rmap->rend;
5373     PetscCall(ISCreateStride(PETSC_COMM_SELF, end - start, start, 1, &isrowa));
5374   } else {
5375     isrowa = *row;
5376   }
5377   if (!col) {
5378     start = A->cmap->rstart;
5379     cmap  = a->garray;
5380     nzA   = a->A->cmap->n;
5381     nzB   = a->B->cmap->n;
5382     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5383     ncols = 0;
5384     for (i = 0; i < nzB; i++) {
5385       if (cmap[i] < start) idx[ncols++] = cmap[i];
5386       else break;
5387     }
5388     imark = i;
5389     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;
5390     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i];
5391     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &iscola));
5392   } else {
5393     iscola = *col;
5394   }
5395   if (scall != MAT_INITIAL_MATRIX) {
5396     PetscCall(PetscMalloc1(1, &aloc));
5397     aloc[0] = *A_loc;
5398   }
5399   PetscCall(MatCreateSubMatrices(A, 1, &isrowa, &iscola, scall, &aloc));
5400   if (!col) { /* attach global id of condensed columns */
5401     PetscCall(PetscObjectCompose((PetscObject)aloc[0], "_petsc_GetLocalMatCondensed_iscol", (PetscObject)iscola));
5402   }
5403   *A_loc = aloc[0];
5404   PetscCall(PetscFree(aloc));
5405   if (!row) PetscCall(ISDestroy(&isrowa));
5406   if (!col) PetscCall(ISDestroy(&iscola));
5407   PetscCall(PetscLogEventEnd(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5408   PetscFunctionReturn(PETSC_SUCCESS);
5409 }
5410 
5411 /*
5412  * Create a sequential AIJ matrix based on row indices: a whole row (all of its columns) is extracted once the row is matched.
5413  * Rows could be local or remote. The routine is designed to be memory scalable so that nothing is based
5414  * on a global size.
5415  * */
5416 static PetscErrorCode MatCreateSeqSubMatrixWithRows_Private(Mat P, IS rows, Mat *P_oth)
5417 {
5418   Mat_MPIAIJ            *p  = (Mat_MPIAIJ *)P->data;
5419   Mat_SeqAIJ            *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data, *p_oth;
5420   PetscInt               plocalsize, nrows, *ilocal, *oilocal, i, lidx, *nrcols, *nlcols, ncol;
5421   PetscMPIInt            owner;
5422   PetscSFNode           *iremote, *oiremote;
5423   const PetscInt        *lrowindices;
5424   PetscSF                sf, osf;
5425   PetscInt               pcstart, *roffsets, *loffsets, *pnnz, j;
5426   PetscInt               ontotalcols, dntotalcols, ntotalcols, nout;
5427   MPI_Comm               comm;
5428   ISLocalToGlobalMapping mapping;
5429   const PetscScalar     *pd_a, *po_a;
5430 
5431   PetscFunctionBegin;
5432   PetscCall(PetscObjectGetComm((PetscObject)P, &comm));
5433   /* plocalsize is the number of roots
5434    * nrows is the number of leaves
5435    * */
5436   PetscCall(MatGetLocalSize(P, &plocalsize, NULL));
5437   PetscCall(ISGetLocalSize(rows, &nrows));
5438   PetscCall(PetscCalloc1(nrows, &iremote));
5439   PetscCall(ISGetIndices(rows, &lrowindices));
5440   for (i = 0; i < nrows; i++) {
5441     /* Find a remote index and an owner for a row
5442      * The row could be local or remote
5443      * */
5444     owner = 0;
5445     lidx  = 0;
5446     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, &lidx));
5447     iremote[i].index = lidx;
5448     iremote[i].rank  = owner;
5449   }
5450   /* Create SF to communicate how many nonzero columns for each row */
5451   PetscCall(PetscSFCreate(comm, &sf));
5452   /* SF will figure out the number of nonzero columns for each row, and their
5453    * offsets
5454    * */
5455   PetscCall(PetscSFSetGraph(sf, plocalsize, nrows, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5456   PetscCall(PetscSFSetFromOptions(sf));
5457   PetscCall(PetscSFSetUp(sf));
5458 
5459   PetscCall(PetscCalloc1(2 * (plocalsize + 1), &roffsets));
5460   PetscCall(PetscCalloc1(2 * plocalsize, &nrcols));
5461   PetscCall(PetscCalloc1(nrows, &pnnz));
5462   roffsets[0] = 0;
5463   roffsets[1] = 0;
5464   for (i = 0; i < plocalsize; i++) {
5465     /* diagonal */
5466     nrcols[i * 2 + 0] = pd->i[i + 1] - pd->i[i];
5467     /* off-diagonal */
5468     nrcols[i * 2 + 1] = po->i[i + 1] - po->i[i];
5469     /* compute offsets so that we know the relative location of each row */
5470     roffsets[(i + 1) * 2 + 0] = roffsets[i * 2 + 0] + nrcols[i * 2 + 0];
5471     roffsets[(i + 1) * 2 + 1] = roffsets[i * 2 + 1] + nrcols[i * 2 + 1];
5472   }
5473   PetscCall(PetscCalloc1(2 * nrows, &nlcols));
5474   PetscCall(PetscCalloc1(2 * nrows, &loffsets));
5475   /* 'r' means root, and 'l' means leaf */
5476   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5477   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5478   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5479   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5480   PetscCall(PetscSFDestroy(&sf));
5481   PetscCall(PetscFree(roffsets));
5482   PetscCall(PetscFree(nrcols));
5483   dntotalcols = 0;
5484   ontotalcols = 0;
5485   ncol        = 0;
5486   for (i = 0; i < nrows; i++) {
5487     pnnz[i] = nlcols[i * 2 + 0] + nlcols[i * 2 + 1];
5488     ncol    = PetscMax(pnnz[i], ncol);
5489     /* diagonal */
5490     dntotalcols += nlcols[i * 2 + 0];
5491     /* off-diagonal */
5492     ontotalcols += nlcols[i * 2 + 1];
5493   }
5494   /* We do not need to figure out the exact number of columns
5495    * since all the calculations will be done by going through the raw data
5496    * */
5497   PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, nrows, ncol, 0, pnnz, P_oth));
5498   PetscCall(MatSetUp(*P_oth));
5499   PetscCall(PetscFree(pnnz));
5500   p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5501   /* diagonal */
5502   PetscCall(PetscCalloc1(dntotalcols, &iremote));
5503   /* off-diagonal */
5504   PetscCall(PetscCalloc1(ontotalcols, &oiremote));
5505   /* diagonal */
5506   PetscCall(PetscCalloc1(dntotalcols, &ilocal));
5507   /* off-diagonal */
5508   PetscCall(PetscCalloc1(ontotalcols, &oilocal));
5509   dntotalcols = 0;
5510   ontotalcols = 0;
5511   ntotalcols  = 0;
5512   for (i = 0; i < nrows; i++) {
5513     owner = 0;
5514     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, NULL));
5515     /* Set iremote for diag matrix */
5516     for (j = 0; j < nlcols[i * 2 + 0]; j++) {
5517       iremote[dntotalcols].index = loffsets[i * 2 + 0] + j;
5518       iremote[dntotalcols].rank  = owner;
5519       /* P_oth is seqAIJ, so ilocal needs to point to the first part of memory */
5520       ilocal[dntotalcols++] = ntotalcols++;
5521     }
5522     /* off-diagonal */
5523     for (j = 0; j < nlcols[i * 2 + 1]; j++) {
5524       oiremote[ontotalcols].index = loffsets[i * 2 + 1] + j;
5525       oiremote[ontotalcols].rank  = owner;
5526       oilocal[ontotalcols++]      = ntotalcols++;
5527     }
5528   }
5529   PetscCall(ISRestoreIndices(rows, &lrowindices));
5530   PetscCall(PetscFree(loffsets));
5531   PetscCall(PetscFree(nlcols));
5532   PetscCall(PetscSFCreate(comm, &sf));
5533   /* P serves as the roots and P_oth as the leaves
5534    * Diag matrix
5535    * */
5536   PetscCall(PetscSFSetGraph(sf, pd->i[plocalsize], dntotalcols, ilocal, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5537   PetscCall(PetscSFSetFromOptions(sf));
5538   PetscCall(PetscSFSetUp(sf));
5539 
5540   PetscCall(PetscSFCreate(comm, &osf));
5541   /* off-diagonal */
5542   PetscCall(PetscSFSetGraph(osf, po->i[plocalsize], ontotalcols, oilocal, PETSC_OWN_POINTER, oiremote, PETSC_OWN_POINTER));
5543   PetscCall(PetscSFSetFromOptions(osf));
5544   PetscCall(PetscSFSetUp(osf));
5545   PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5546   PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5547   /* operate on the matrix internal data to save memory */
5548   PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5549   PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5550   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, NULL));
5551   /* Convert to global indices for diag matrix */
5552   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] += pcstart;
5553   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
5554   /* We want P_oth to store global indices */
5555   PetscCall(ISLocalToGlobalMappingCreate(comm, 1, p->B->cmap->n, p->garray, PETSC_COPY_VALUES, &mapping));
5556   /* Use memory scalable approach */
5557   PetscCall(ISLocalToGlobalMappingSetType(mapping, ISLOCALTOGLOBALMAPPINGHASH));
5558   PetscCall(ISLocalToGlobalMappingApply(mapping, po->i[plocalsize], po->j, po->j));
5559   PetscCall(PetscSFBcastBegin(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5560   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
5561   /* Convert back to local indices */
5562   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] -= pcstart;
5563   PetscCall(PetscSFBcastEnd(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5564   nout = 0;
5565   PetscCall(ISGlobalToLocalMappingApply(mapping, IS_GTOLM_DROP, po->i[plocalsize], po->j, &nout, po->j));
5566   PetscCheck(nout == po->i[plocalsize], comm, PETSC_ERR_ARG_INCOMP, "n %" PetscInt_FMT " does not equal nout %" PetscInt_FMT " ", po->i[plocalsize], nout);
5567   PetscCall(ISLocalToGlobalMappingDestroy(&mapping));
5568   /* Exchange values */
5569   PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5570   PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5571   PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5572   PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5573   /* Stop PETSc from shrinking memory */
5574   for (i = 0; i < nrows; i++) p_oth->ilen[i] = p_oth->imax[i];
5575   PetscCall(MatAssemblyBegin(*P_oth, MAT_FINAL_ASSEMBLY));
5576   PetscCall(MatAssemblyEnd(*P_oth, MAT_FINAL_ASSEMBLY));
5577   /* Attach PetscSF objects to P_oth so that we can reuse it later */
5578   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "diagsf", (PetscObject)sf));
5579   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "offdiagsf", (PetscObject)osf));
5580   PetscCall(PetscSFDestroy(&sf));
5581   PetscCall(PetscSFDestroy(&osf));
5582   PetscFunctionReturn(PETSC_SUCCESS);
5583 }
5584 
5585 /*
5586  * Creates a SeqAIJ matrix by taking the rows of B that correspond to the nonzero columns of local A.
5587  * This supports MPIAIJ and MAIJ.
5588  * */
5589 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat A, Mat P, PetscInt dof, MatReuse reuse, Mat *P_oth)
5590 {
5591   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
5592   Mat_SeqAIJ *p_oth;
5593   IS          rows, map;
5594   PetscHMapI  hamp;
5595   PetscInt    i, htsize, *rowindices, off, *mapping, key, count;
5596   MPI_Comm    comm;
5597   PetscSF     sf, osf;
5598   PetscBool   has;
5599 
5600   PetscFunctionBegin;
5601   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5602   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, P, 0, 0));
5603   /* If it is the first time, create an index set of off-diag nonzero columns of A,
5604    *  and then create a submatrix (that often is an overlapping matrix)
5605    * */
5606   if (reuse == MAT_INITIAL_MATRIX) {
5607     /* Use a hash table to figure out unique keys */
5608     PetscCall(PetscHMapICreateWithSize(a->B->cmap->n, &hamp));
5609     PetscCall(PetscCalloc1(a->B->cmap->n, &mapping));
5610     count = 0;
5611     /* Assume that a->garray is sorted, otherwise the following does not make sense */
5612     for (i = 0; i < a->B->cmap->n; i++) {
5613       key = a->garray[i] / dof;
5614       PetscCall(PetscHMapIHas(hamp, key, &has));
5615       if (!has) {
5616         mapping[i] = count;
5617         PetscCall(PetscHMapISet(hamp, key, count++));
5618       } else {
5619         /* Current 'i' has the same value the previous step */
5620         mapping[i] = count - 1;
5621       }
5622     }
5623     PetscCall(ISCreateGeneral(comm, a->B->cmap->n, mapping, PETSC_OWN_POINTER, &map));
5624     PetscCall(PetscHMapIGetSize(hamp, &htsize));
5625     PetscCheck(htsize == count, comm, PETSC_ERR_ARG_INCOMP, " Size of hash map %" PetscInt_FMT " is inconsistent with count %" PetscInt_FMT, htsize, count);
5626     PetscCall(PetscCalloc1(htsize, &rowindices));
5627     off = 0;
5628     PetscCall(PetscHMapIGetKeys(hamp, &off, rowindices));
5629     PetscCall(PetscHMapIDestroy(&hamp));
5630     PetscCall(PetscSortInt(htsize, rowindices));
5631     PetscCall(ISCreateGeneral(comm, htsize, rowindices, PETSC_OWN_POINTER, &rows));
5632     /* In case the matrix was already created but the user wants to recreate it */
5633     PetscCall(MatDestroy(P_oth));
5634     PetscCall(MatCreateSeqSubMatrixWithRows_Private(P, rows, P_oth));
5635     PetscCall(PetscObjectCompose((PetscObject)*P_oth, "aoffdiagtopothmapping", (PetscObject)map));
5636     PetscCall(ISDestroy(&map));
5637     PetscCall(ISDestroy(&rows));
5638   } else if (reuse == MAT_REUSE_MATRIX) {
5639     /* If matrix was already created, we simply update values using SF objects
5640      * that were attached to the matrix earlier.
5641      */
5642     const PetscScalar *pd_a, *po_a;
5643 
5644     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "diagsf", (PetscObject *)&sf));
5645     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "offdiagsf", (PetscObject *)&osf));
5646     PetscCheck(sf && osf, comm, PETSC_ERR_ARG_NULL, "Matrix is not initialized yet");
5647     p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5648     /* Update values in place */
5649     PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5650     PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5651     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5652     PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5653     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5654     PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5655     PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5656     PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5657   } else SETERRQ(comm, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unknown reuse type");
5658   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, P, 0, 0));
5659   PetscFunctionReturn(PETSC_SUCCESS);
5660 }
5661 
5662 /*@C
5663   MatGetBrowsOfAcols - Returns an `IS` that contains the rows of `B` corresponding to the nonzero columns of local `A`
5664 
5665   Collective
5666 
5667   Input Parameters:
5668 + A     - the first matrix in `MATMPIAIJ` format
5669 . B     - the second matrix in `MATMPIAIJ` format
5670 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5671 
5672   Output Parameters:
5673 + rowb  - On input, the index set of rows of B to extract (or `NULL`); modified on output
5674 . colb  - On input, the index set of columns of B to extract (or `NULL`); modified on output
5675 - B_seq - the sequential matrix generated
5676 
5677   Level: developer
5678 
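  Example Usage:
  A minimal sketch (not taken from the PETSc examples); the index sets created with `MAT_INITIAL_MATRIX` are reused on later calls:
.vb
  IS  rowb = NULL, colb = NULL;
  Mat B_seq;
  PetscCall(MatGetBrowsOfAcols(A, B, MAT_INITIAL_MATRIX, &rowb, &colb, &B_seq));
  // ... B's values change, but its nonzero pattern does not ...
  PetscCall(MatGetBrowsOfAcols(A, B, MAT_REUSE_MATRIX, &rowb, &colb, &B_seq));
  PetscCall(ISDestroy(&rowb));
  PetscCall(ISDestroy(&colb));
  PetscCall(MatDestroy(&B_seq));
.ve
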
5679 .seealso: `Mat`, `MATMPIAIJ`, `IS`, `MatReuse`
5680 @*/
5681 PetscErrorCode MatGetBrowsOfAcols(Mat A, Mat B, MatReuse scall, IS *rowb, IS *colb, Mat *B_seq)
5682 {
5683   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5684   PetscInt   *idx, i, start, ncols, nzA, nzB, *cmap, imark;
5685   IS          isrowb, iscolb;
5686   Mat        *bseq = NULL;
5687 
5688   PetscFunctionBegin;
5689   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5690              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5691   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAcols, A, B, 0, 0));
5692 
5693   if (scall == MAT_INITIAL_MATRIX) {
5694     start = A->cmap->rstart;
5695     cmap  = a->garray;
5696     nzA   = a->A->cmap->n;
5697     nzB   = a->B->cmap->n;
5698     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5699     ncols = 0;
5700     for (i = 0; i < nzB; i++) { /* row < local row index */
5701       if (cmap[i] < start) idx[ncols++] = cmap[i];
5702       else break;
5703     }
5704     imark = i;
5705     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;   /* local rows */
5706     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i]; /* row > local row index */
5707     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &isrowb));
5708     PetscCall(ISCreateStride(PETSC_COMM_SELF, B->cmap->N, 0, 1, &iscolb));
5709   } else {
5710     PetscCheck(rowb && colb, PETSC_COMM_SELF, PETSC_ERR_SUP, "IS rowb and colb must be provided for MAT_REUSE_MATRIX");
5711     isrowb = *rowb;
5712     iscolb = *colb;
5713     PetscCall(PetscMalloc1(1, &bseq));
5714     bseq[0] = *B_seq;
5715   }
5716   PetscCall(MatCreateSubMatrices(B, 1, &isrowb, &iscolb, scall, &bseq));
5717   *B_seq = bseq[0];
5718   PetscCall(PetscFree(bseq));
5719   if (!rowb) {
5720     PetscCall(ISDestroy(&isrowb));
5721   } else {
5722     *rowb = isrowb;
5723   }
5724   if (!colb) {
5725     PetscCall(ISDestroy(&iscolb));
5726   } else {
5727     *colb = iscolb;
5728   }
5729   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAcols, A, B, 0, 0));
5730   PetscFunctionReturn(PETSC_SUCCESS);
5731 }
5732 
5733 /*
5734     MatGetBrowsOfAoCols_MPIAIJ - Creates a `MATSEQAIJ` matrix by taking the rows of B that correspond to the nonzero columns
5735     of the OFF-DIAGONAL portion of local A
5736 
5737     Collective
5738 
5739    Input Parameters:
5740 +    A,B - the matrices in `MATMPIAIJ` format
5741 -    scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5742 
5743    Output Parameters:
5744 +    startsj_s - starting point in B's sending j-arrays, saved for MAT_REUSE (or NULL)
5745 .    startsj_r - starting point in B's receiving j-arrays, saved for MAT_REUSE (or NULL)
5746 .    bufa_ptr - array for sending matrix values, saved for MAT_REUSE (or NULL)
5747 -    B_oth - the sequential matrix generated with size aBn=a->B->cmap->n by B->cmap->N
5748 
5749     Developer Note:
5750     This directly accesses information inside the VecScatter associated with the matrix-vector product
5751      for this matrix. This is not desirable.
5752 
5753     Level: developer
5754 
5755 */
5756 
5757 PetscErrorCode MatGetBrowsOfAoCols_MPIAIJ(Mat A, Mat B, MatReuse scall, PetscInt **startsj_s, PetscInt **startsj_r, MatScalar **bufa_ptr, Mat *B_oth)
5758 {
5759   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
5760   VecScatter         ctx;
5761   MPI_Comm           comm;
5762   const PetscMPIInt *rprocs, *sprocs;
5763   PetscMPIInt        nrecvs, nsends;
5764   const PetscInt    *srow, *rstarts, *sstarts;
5765   PetscInt          *rowlen, *bufj, *bufJ, ncols = 0, aBn = a->B->cmap->n, row, *b_othi, *b_othj, *rvalues = NULL, *svalues = NULL, *cols, sbs, rbs;
5766   PetscInt           i, j, k = 0, l, ll, nrows, *rstartsj = NULL, *sstartsj, len;
5767   PetscScalar       *b_otha, *bufa, *bufA, *vals = NULL;
5768   MPI_Request       *reqs = NULL, *rwaits = NULL, *swaits = NULL;
5769   PetscMPIInt        size, tag, rank, nreqs;
5770 
5771   PetscFunctionBegin;
5772   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5773   PetscCallMPI(MPI_Comm_size(comm, &size));
5774 
5775   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5776              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5777   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, B, 0, 0));
5778   PetscCallMPI(MPI_Comm_rank(comm, &rank));
5779 
5780   if (size == 1) {
5781     startsj_s = NULL;
5782     bufa_ptr  = NULL;
5783     *B_oth    = NULL;
5784     PetscFunctionReturn(PETSC_SUCCESS);
5785   }
5786 
5787   ctx = a->Mvctx;
5788   tag = ((PetscObject)ctx)->tag;
5789 
5790   PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &srow, &sprocs, &sbs));
5791   /* rprocs[] must be ordered so that indices received from them are ordered in rvalues[], which is key to algorithms used in this subroutine */
5792   PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL /*indices not needed*/, &rprocs, &rbs));
5793   PetscCall(PetscMPIIntCast(nsends + nrecvs, &nreqs));
5794   PetscCall(PetscMalloc1(nreqs, &reqs));
5795   rwaits = reqs;
5796   swaits = PetscSafePointerPlusOffset(reqs, nrecvs);
5797 
5798   if (!startsj_s || !bufa_ptr) scall = MAT_INITIAL_MATRIX;
5799   if (scall == MAT_INITIAL_MATRIX) {
5800     /* i-array */
5801     /*  post receives */
5802     if (nrecvs) PetscCall(PetscMalloc1(rbs * (rstarts[nrecvs] - rstarts[0]), &rvalues)); /* rstarts can be NULL when nrecvs=0 */
5803     for (i = 0; i < nrecvs; i++) {
5804       rowlen = rvalues + rstarts[i] * rbs;
5805       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of indices to be received */
5806       PetscCallMPI(MPIU_Irecv(rowlen, nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5807     }
5808 
5809     /* pack the outgoing message */
5810     PetscCall(PetscMalloc2(nsends + 1, &sstartsj, nrecvs + 1, &rstartsj));
5811 
5812     sstartsj[0] = 0;
5813     rstartsj[0] = 0;
5814     len         = 0; /* total length of j or a array to be sent */
5815     if (nsends) {
5816       k = sstarts[0]; /* ATTENTION: sstarts[0] and rstarts[0] are not necessarily zero */
5817       PetscCall(PetscMalloc1(sbs * (sstarts[nsends] - sstarts[0]), &svalues));
5818     }
5819     for (i = 0; i < nsends; i++) {
5820       rowlen = svalues + (sstarts[i] - sstarts[0]) * sbs;
5821       nrows  = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5822       for (j = 0; j < nrows; j++) {
5823         row = srow[k] + B->rmap->range[rank]; /* global row idx */
5824         for (l = 0; l < sbs; l++) {
5825           PetscCall(MatGetRow_MPIAIJ(B, row + l, &ncols, NULL, NULL)); /* rowlength */
5826 
5827           rowlen[j * sbs + l] = ncols;
5828 
5829           len += ncols;
5830           PetscCall(MatRestoreRow_MPIAIJ(B, row + l, &ncols, NULL, NULL));
5831         }
5832         k++;
5833       }
5834       PetscCallMPI(MPIU_Isend(rowlen, nrows * sbs, MPIU_INT, sprocs[i], tag, comm, swaits + i));
5835 
5836       sstartsj[i + 1] = len; /* starting point of (i+1)-th outgoing msg in bufj and bufa */
5837     }
5838     /* recvs and sends of i-array are completed */
5839     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5840     PetscCall(PetscFree(svalues));
5841 
5842     /* allocate buffers for sending j and a arrays */
5843     PetscCall(PetscMalloc1(len, &bufj));
5844     PetscCall(PetscMalloc1(len, &bufa));
5845 
5846     /* create i-array of B_oth */
5847     PetscCall(PetscMalloc1(aBn + 1, &b_othi));
5848 
5849     b_othi[0] = 0;
5850     len       = 0; /* total length of j or a array to be received */
5851     k         = 0;
5852     for (i = 0; i < nrecvs; i++) {
5853       rowlen = rvalues + (rstarts[i] - rstarts[0]) * rbs;
5854       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of rows to be received */
5855       for (j = 0; j < nrows; j++) {
5856         b_othi[k + 1] = b_othi[k] + rowlen[j];
5857         PetscCall(PetscIntSumError(rowlen[j], len, &len));
5858         k++;
5859       }
5860       rstartsj[i + 1] = len; /* starting point of (i+1)-th incoming msg in bufj and bufa */
5861     }
5862     PetscCall(PetscFree(rvalues));
5863 
5864     /* allocate space for j and a arrays of B_oth */
5865     PetscCall(PetscMalloc1(b_othi[aBn], &b_othj));
5866     PetscCall(PetscMalloc1(b_othi[aBn], &b_otha));
5867 
5868     /* j-array */
5869     /*  post receives of j-array */
5870     for (i = 0; i < nrecvs; i++) {
5871       nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5872       PetscCallMPI(MPIU_Irecv(PetscSafePointerPlusOffset(b_othj, rstartsj[i]), nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5873     }
5874 
5875     /* pack the outgoing message j-array */
5876     if (nsends) k = sstarts[0];
5877     for (i = 0; i < nsends; i++) {
5878       nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5879       bufJ  = PetscSafePointerPlusOffset(bufj, sstartsj[i]);
5880       for (j = 0; j < nrows; j++) {
5881         row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5882         for (ll = 0; ll < sbs; ll++) {
5883           PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5884           for (l = 0; l < ncols; l++) *bufJ++ = cols[l];
5885           PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5886         }
5887       }
5888       PetscCallMPI(MPIU_Isend(PetscSafePointerPlusOffset(bufj, sstartsj[i]), sstartsj[i + 1] - sstartsj[i], MPIU_INT, sprocs[i], tag, comm, swaits + i));
5889     }
5890 
5891     /* recvs and sends of j-array are completed */
5892     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5893   } else if (scall == MAT_REUSE_MATRIX) {
5894     sstartsj = *startsj_s;
5895     rstartsj = *startsj_r;
5896     bufa     = *bufa_ptr;
5897     PetscCall(MatSeqAIJGetArrayWrite(*B_oth, &b_otha));
5898   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5899 
5900   /* a-array */
5901   /*  post receives of a-array */
5902   for (i = 0; i < nrecvs; i++) {
5903     nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5904     PetscCallMPI(MPIU_Irecv(PetscSafePointerPlusOffset(b_otha, rstartsj[i]), nrows, MPIU_SCALAR, rprocs[i], tag, comm, rwaits + i));
5905   }
5906 
5907   /* pack the outgoing message a-array */
5908   if (nsends) k = sstarts[0];
5909   for (i = 0; i < nsends; i++) {
5910     nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5911     bufA  = PetscSafePointerPlusOffset(bufa, sstartsj[i]);
5912     for (j = 0; j < nrows; j++) {
5913       row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5914       for (ll = 0; ll < sbs; ll++) {
5915         PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5916         for (l = 0; l < ncols; l++) *bufA++ = vals[l];
5917         PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5918       }
5919     }
5920     PetscCallMPI(MPIU_Isend(PetscSafePointerPlusOffset(bufa, sstartsj[i]), sstartsj[i + 1] - sstartsj[i], MPIU_SCALAR, sprocs[i], tag, comm, swaits + i));
5921   }
5922   /* recvs and sends of a-array are completed */
5923   if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5924   PetscCall(PetscFree(reqs));
5925 
5926   if (scall == MAT_INITIAL_MATRIX) {
5927     Mat_SeqAIJ *b_oth;
5928 
5929     /* put together the new matrix */
5930     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, aBn, B->cmap->N, b_othi, b_othj, b_otha, B_oth));
5931 
5932     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5933     /* Since these are PETSc arrays, change flags to free them as necessary. */
5934     b_oth          = (Mat_SeqAIJ *)(*B_oth)->data;
5935     b_oth->free_a  = PETSC_TRUE;
5936     b_oth->free_ij = PETSC_TRUE;
5937     b_oth->nonew   = 0;
5938 
5939     PetscCall(PetscFree(bufj));
5940     if (!startsj_s || !bufa_ptr) {
5941       PetscCall(PetscFree2(sstartsj, rstartsj));
5942       PetscCall(PetscFree(bufa_ptr));
5943     } else {
5944       *startsj_s = sstartsj;
5945       *startsj_r = rstartsj;
5946       *bufa_ptr  = bufa;
5947     }
5948   } else if (scall == MAT_REUSE_MATRIX) {
5949     PetscCall(MatSeqAIJRestoreArrayWrite(*B_oth, &b_otha));
5950   }
5951 
5952   PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE, &nsends, &sstarts, &srow, &sprocs, &sbs));
5953   PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE, &nrecvs, &rstarts, NULL, &rprocs, &rbs));
5954   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, B, 0, 0));
5955   PetscFunctionReturn(PETSC_SUCCESS);
5956 }
5957 
5958 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCRL(Mat, MatType, MatReuse, Mat *);
5959 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJPERM(Mat, MatType, MatReuse, Mat *);
5960 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJSELL(Mat, MatType, MatReuse, Mat *);
5961 #if defined(PETSC_HAVE_MKL_SPARSE)
5962 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJMKL(Mat, MatType, MatReuse, Mat *);
5963 #endif
5964 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIBAIJ(Mat, MatType, MatReuse, Mat *);
5965 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISBAIJ(Mat, MatType, MatReuse, Mat *);
5966 #if defined(PETSC_HAVE_ELEMENTAL)
5967 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_Elemental(Mat, MatType, MatReuse, Mat *);
5968 #endif
5969 #if defined(PETSC_HAVE_SCALAPACK) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
5970 PETSC_INTERN PetscErrorCode MatConvert_AIJ_ScaLAPACK(Mat, MatType, MatReuse, Mat *);
5971 #endif
5972 #if defined(PETSC_HAVE_HYPRE)
5973 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
5974 #endif
5975 #if defined(PETSC_HAVE_CUDA)
5976 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
5977 #endif
5978 #if defined(PETSC_HAVE_HIP)
5979 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
5980 #endif
5981 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
5982 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat, MatType, MatReuse, Mat *);
5983 #endif
5984 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISELL(Mat, MatType, MatReuse, Mat *);
5985 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
5986 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_IS_XAIJ(Mat);
5987 
5988 /*
5989     Computes C = (B'*A')' since computing A*B directly is untenable
5990 
5991                n                       p                          p
5992         [             ]       [             ]         [                 ]
5993       m [      A      ]  *  n [       B     ]   =   m [         C       ]
5994         [             ]       [             ]         [                 ]
5995 
5996 */
5997 static PetscErrorCode MatMatMultNumeric_MPIDense_MPIAIJ(Mat A, Mat B, Mat C)
5998 {
5999   Mat At, Bt, Ct;
6000 
6001   PetscFunctionBegin;
6002   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
6003   PetscCall(MatTranspose(B, MAT_INITIAL_MATRIX, &Bt));
6004   PetscCall(MatMatMult(Bt, At, MAT_INITIAL_MATRIX, PETSC_CURRENT, &Ct));
6005   PetscCall(MatDestroy(&At));
6006   PetscCall(MatDestroy(&Bt));
6007   PetscCall(MatTransposeSetPrecursor(Ct, C));
6008   PetscCall(MatTranspose(Ct, MAT_REUSE_MATRIX, &C));
6009   PetscCall(MatDestroy(&Ct));
6010   PetscFunctionReturn(PETSC_SUCCESS);
6011 }
6012 
6013 static PetscErrorCode MatMatMultSymbolic_MPIDense_MPIAIJ(Mat A, Mat B, PetscReal fill, Mat C)
6014 {
6015   PetscBool cisdense;
6016 
6017   PetscFunctionBegin;
6018   PetscCheck(A->cmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "A->cmap->n %" PetscInt_FMT " != B->rmap->n %" PetscInt_FMT, A->cmap->n, B->rmap->n);
6019   PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
6020   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6021   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSECUDA, MATMPIDENSEHIP, ""));
6022   if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
6023   PetscCall(MatSetUp(C));
6024 
6025   C->ops->matmultnumeric = MatMatMultNumeric_MPIDense_MPIAIJ;
6026   PetscFunctionReturn(PETSC_SUCCESS);
6027 }
6028 
6029 static PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ_AB(Mat C)
6030 {
6031   Mat_Product *product = C->product;
6032   Mat          A = product->A, B = product->B;
6033 
6034   PetscFunctionBegin;
6035   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
6036              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
6037   C->ops->matmultsymbolic = MatMatMultSymbolic_MPIDense_MPIAIJ;
6038   C->ops->productsymbolic = MatProductSymbolic_AB;
6039   PetscFunctionReturn(PETSC_SUCCESS);
6040 }
6041 
6042 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ(Mat C)
6043 {
6044   Mat_Product *product = C->product;
6045 
6046   PetscFunctionBegin;
6047   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_MPIDense_MPIAIJ_AB(C));
6048   PetscFunctionReturn(PETSC_SUCCESS);
6049 }
6050 
6051 /*
6052    Merge two sets of sorted nonzeros and return a CSR for the merged (sequential) matrix
6053 
6054   Input Parameters:
6055 
6056     j1,rowBegin1,rowEnd1,jmap1: describe the first set of nonzeros (Set1)
6057     j2,rowBegin2,rowEnd2,jmap2: describe the second set of nonzeros (Set2)
6058 
6059     mat: both sets' nonzeros are on m rows, where m is the number of local rows of the matrix mat
6060 
6061     For Set1, j1[] contains column indices of the nonzeros.
6062     For the k-th row (0<=k<m), [rowBegin1[k],rowEnd1[k]) index into j1[] and point to the begin/end nonzero in row k
6063     respectively (note rowEnd1[k] is not necessarily equal to rwoBegin1[k+1]). Indices in this range of j1[] are sorted,
6064     but might have repeats. jmap1[t+1] - jmap1[t] is the number of repeats for the t-th unique nonzero in Set1.
6065 
6066     Similar for Set2.
6067 
6068     This routine merges the two sets of nonzeros row by row and removes repeats.
6069 
6070   Output Parameters: (memory is allocated by the caller)
6071 
6072     i[],j[]: the CSR of the merged matrix, which has m rows.
6073     imap1[]: the k-th unique nonzero in Set1 (k=0,1,...) corresponds to imap1[k]-th unique nonzero in the merged matrix.
6074     imap2[]: similar to imap1[], but for Set2.
6075     Note we order nonzeros row-by-row and from left to right.
6076 */
6077 static PetscErrorCode MatMergeEntries_Internal(Mat mat, const PetscInt j1[], const PetscInt j2[], const PetscCount rowBegin1[], const PetscCount rowEnd1[], const PetscCount rowBegin2[], const PetscCount rowEnd2[], const PetscCount jmap1[], const PetscCount jmap2[], PetscCount imap1[], PetscCount imap2[], PetscInt i[], PetscInt j[])
6078 {
6079   PetscInt   r, m; /* Row index of mat */
6080   PetscCount t, t1, t2, b1, e1, b2, e2;
6081 
6082   PetscFunctionBegin;
6083   PetscCall(MatGetLocalSize(mat, &m, NULL));
6084   t1 = t2 = t = 0; /* Count unique nonzeros of in Set1, Set1 and the merged respectively */
6085   i[0]        = 0;
6086   for (r = 0; r < m; r++) { /* Do row by row merging */
6087     b1 = rowBegin1[r];
6088     e1 = rowEnd1[r];
6089     b2 = rowBegin2[r];
6090     e2 = rowEnd2[r];
6091     while (b1 < e1 && b2 < e2) {
6092       if (j1[b1] == j2[b2]) { /* Same column index and hence same nonzero */
6093         j[t]      = j1[b1];
6094         imap1[t1] = t;
6095         imap2[t2] = t;
6096         b1 += jmap1[t1 + 1] - jmap1[t1]; /* Jump to next unique local nonzero */
6097         b2 += jmap2[t2 + 1] - jmap2[t2]; /* Jump to next unique remote nonzero */
6098         t1++;
6099         t2++;
6100         t++;
6101       } else if (j1[b1] < j2[b2]) {
6102         j[t]      = j1[b1];
6103         imap1[t1] = t;
6104         b1 += jmap1[t1 + 1] - jmap1[t1];
6105         t1++;
6106         t++;
6107       } else {
6108         j[t]      = j2[b2];
6109         imap2[t2] = t;
6110         b2 += jmap2[t2 + 1] - jmap2[t2];
6111         t2++;
6112         t++;
6113       }
6114     }
6115     /* Merge the remaining in either j1[] or j2[] */
6116     while (b1 < e1) {
6117       j[t]      = j1[b1];
6118       imap1[t1] = t;
6119       b1 += jmap1[t1 + 1] - jmap1[t1];
6120       t1++;
6121       t++;
6122     }
6123     while (b2 < e2) {
6124       j[t]      = j2[b2];
6125       imap2[t2] = t;
6126       b2 += jmap2[t2 + 1] - jmap2[t2];
6127       t2++;
6128       t++;
6129     }
6130     PetscCall(PetscIntCast(t, i + r + 1));
6131   }
6132   PetscFunctionReturn(PETSC_SUCCESS);
6133 }
6134 
6135 /*
6136   Split nonzeros in a block of local rows into two subsets: those in the diagonal block and those in the off-diagonal block
6137 
6138   Input Parameters:
6139     mat: an MPI matrix that provides row and column layout information for splitting. Let's say its number of local rows is m.
6140     n,i[],j[],perm[]: there are n input entries, belonging to m rows. Row/col indices of the entries are stored in i[] and j[]
6141       respectively, along with a permutation array perm[]. Length of the i[],j[],perm[] arrays is n.
6142 
6143       i[] is already sorted, but within a row, j[] is not sorted and might have repeats.
6144       i[] might contain negative indices at the beginning, which means the corresponding entries should be ignored in the splitting.
6145 
6146   Output Parameters:
6147     j[],perm[]: the routine needs to sort j[] within each row along with perm[].
6148     rowBegin[],rowMid[],rowEnd[]: of length m, and the memory is preallocated and zeroed by the caller.
6149       They contain indices pointing to j[]. For 0<=r<m, [rowBegin[r],rowMid[r]) point to begin/end entries of row r of the diagonal block,
6150       and [rowMid[r],rowEnd[r]) point to begin/end entries of row r of the off-diagonal block.
6151 
6152     Aperm[],Ajmap[],Atot,Annz: Arrays are allocated by this routine.
6153       Atot: number of entries belonging to the diagonal block.
6154       Annz: number of unique nonzeros belonging to the diagonal block.
6155       Aperm[Atot] stores values from perm[] for entries belonging to the diagonal block. Length of Aperm[] is Atot, though it may also count
6156         repeats (i.e., same 'i,j' pair).
6157       Ajmap[Annz+1] stores the number of repeats of each unique entry belonging to the diagonal block. More precisely, Ajmap[t+1] - Ajmap[t]
6158         is the number of repeats for the t-th unique entry in the diagonal block. Ajmap[0] is always 0.
6159 
6163     Bperm[], Bjmap[], Btot, Bnnz are similar but for the off-diagonal block.
6164 
6165     Aperm[],Bperm[],Ajmap[] and Bjmap[] are allocated separately by this routine with PetscMalloc1().
6166 */
6167 static PetscErrorCode MatSplitEntries_Internal(Mat mat, PetscCount n, const PetscInt i[], PetscInt j[], PetscCount perm[], PetscCount rowBegin[], PetscCount rowMid[], PetscCount rowEnd[], PetscCount *Atot_, PetscCount **Aperm_, PetscCount *Annz_, PetscCount **Ajmap_, PetscCount *Btot_, PetscCount **Bperm_, PetscCount *Bnnz_, PetscCount **Bjmap_)
6168 {
6169   PetscInt    cstart, cend, rstart, rend, row, col;
6170   PetscCount  Atot = 0, Btot = 0; /* Total number of nonzeros in the diagonal and off-diagonal blocks */
6171   PetscCount  Annz = 0, Bnnz = 0; /* Number of unique nonzeros in the diagonal and off-diagonal blocks */
6172   PetscCount  k, m, p, q, r, s, mid;
6173   PetscCount *Aperm, *Bperm, *Ajmap, *Bjmap;
6174 
6175   PetscFunctionBegin;
6176   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6177   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6178   m = rend - rstart;
6179 
6180   /* Skip negative rows */
6181   for (k = 0; k < n; k++)
6182     if (i[k] >= 0) break;
6183 
6184   /* Process [k,n): sort and partition each local row into diag and offdiag portions,
6185      fill rowBegin[], rowMid[], rowEnd[], and count Atot, Btot, Annz, Bnnz.
6186   */
6187   while (k < n) {
6188     row = i[k];
6189     /* Entries in [k,s) are in one row. Shift diagonal block col indices so that diag is ahead of offdiag after sorting the row */
6190     for (s = k; s < n; s++)
6191       if (i[s] != row) break;
6192 
6193     /* Shift diag columns to range of [-PETSC_INT_MAX, -1] */
6194     for (p = k; p < s; p++) {
6195       if (j[p] >= cstart && j[p] < cend) j[p] -= PETSC_INT_MAX;
6196     }
6197     PetscCall(PetscSortIntWithCountArray(s - k, j + k, perm + k));
6198     PetscCall(PetscSortedIntUpperBound(j, k, s, -1, &mid)); /* Separate [k,s) into [k,mid) for diag and [mid,s) for offdiag */
6199     rowBegin[row - rstart] = k;
6200     rowMid[row - rstart]   = mid;
6201     rowEnd[row - rstart]   = s;
6202     PetscCheck(k == s || j[s - 1] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index %" PetscInt_FMT " is >= matrix column size %" PetscInt_FMT, j[s - 1], mat->cmap->N);
6203 
6204     /* Count nonzeros of this diag/offdiag row, which might have repeats */
6205     Atot += mid - k;
6206     Btot += s - mid;
6207 
6208     /* Count unique nonzeros of this diag row */
6209     for (p = k; p < mid;) {
6210       col = j[p];
6211       do {
6212         j[p] += PETSC_INT_MAX; /* Revert the modified diagonal indices */
6213         p++;
6214       } while (p < mid && j[p] == col);
6215       Annz++;
6216     }
6217 
6218     /* Count unique nonzeros of this offdiag row */
6219     for (p = mid; p < s;) {
6220       col = j[p];
6221       do {
6222         p++;
6223       } while (p < s && j[p] == col);
6224       Bnnz++;
6225     }
6226     k = s;
6227   }
6228 
6229   /* Allocation according to Atot, Btot, Annz, Bnnz */
6230   PetscCall(PetscMalloc1(Atot, &Aperm));
6231   PetscCall(PetscMalloc1(Btot, &Bperm));
6232   PetscCall(PetscMalloc1(Annz + 1, &Ajmap));
6233   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap));
6234 
6235   /* Re-scan indices and copy diag/offdiag permutation indices to Aperm, Bperm and also fill Ajmap and Bjmap */
6236   Ajmap[0] = Bjmap[0] = Atot = Btot = Annz = Bnnz = 0;
6237   for (r = 0; r < m; r++) {
6238     k   = rowBegin[r];
6239     mid = rowMid[r];
6240     s   = rowEnd[r];
6241     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Aperm, Atot), PetscSafePointerPlusOffset(perm, k), mid - k));
6242     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Bperm, Btot), PetscSafePointerPlusOffset(perm, mid), s - mid));
6243     Atot += mid - k;
6244     Btot += s - mid;
6245 
6246     /* Scan column indices in this row and find out how many repeats each unique nonzero has */
6247     for (p = k; p < mid;) {
6248       col = j[p];
6249       q   = p;
6250       do {
6251         p++;
6252       } while (p < mid && j[p] == col);
6253       Ajmap[Annz + 1] = Ajmap[Annz] + (p - q);
6254       Annz++;
6255     }
6256 
6257     for (p = mid; p < s;) {
6258       col = j[p];
6259       q   = p;
6260       do {
6261         p++;
6262       } while (p < s && j[p] == col);
6263       Bjmap[Bnnz + 1] = Bjmap[Bnnz] + (p - q);
6264       Bnnz++;
6265     }
6266   }
6267   /* Output */
6268   *Aperm_ = Aperm;
6269   *Annz_  = Annz;
6270   *Atot_  = Atot;
6271   *Ajmap_ = Ajmap;
6272   *Bperm_ = Bperm;
6273   *Bnnz_  = Bnnz;
6274   *Btot_  = Btot;
6275   *Bjmap_ = Bjmap;
6276   PetscFunctionReturn(PETSC_SUCCESS);
6277 }
6278 
6279 /*
6280   Expand the jmap[] array to make a new one in view of nonzeros in the merged matrix
6281 
6282   Input Parameters:
6283     nnz1: number of unique nonzeros in a set that was used to produce imap[], jmap[]
6284     nnz:  number of unique nonzeros in the merged matrix
6285     imap[nnz1]: i-th nonzero in the set is the imap[i]-th nonzero in the merged matrix
6286     jmap[nnz1+1]: i-th nonzero in the set has jmap[i+1] - jmap[i] repeats in the set
6287 
6288   Output Parameter: (memory is allocated by the caller)
6289     jmap_new[nnz+1]: i-th nonzero in the merged matrix has jmap_new[i+1] - jmap_new[i] repeats in the set
6290 
6291   Example:
6292     nnz1 = 4
6293     nnz  = 6
6294     imap = [1,3,4,5]
6295     jmap = [0,3,5,6,7]
6296    then,
6297     jmap_new = [0,0,3,3,5,6,7]
6298 */
6299 static PetscErrorCode ExpandJmap_Internal(PetscCount nnz1, PetscCount nnz, const PetscCount imap[], const PetscCount jmap[], PetscCount jmap_new[])
6300 {
6301   PetscCount k, p;
6302 
6303   PetscFunctionBegin;
6304   jmap_new[0] = 0;
6305   p           = nnz;                /* p loops over jmap_new[] backwards */
6306   for (k = nnz1 - 1; k >= 0; k--) { /* k loops over imap[] */
6307     for (; p > imap[k]; p--) jmap_new[p] = jmap[k + 1];
6308   }
6309   for (; p >= 0; p--) jmap_new[p] = jmap[0];
6310   PetscFunctionReturn(PETSC_SUCCESS);
6311 }
6312 
6313 static PetscErrorCode MatCOOStructDestroy_MPIAIJ(void **data)
6314 {
6315   MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)*data;
6316 
6317   PetscFunctionBegin;
6318   PetscCall(PetscSFDestroy(&coo->sf));
6319   PetscCall(PetscFree(coo->Aperm1));
6320   PetscCall(PetscFree(coo->Bperm1));
6321   PetscCall(PetscFree(coo->Ajmap1));
6322   PetscCall(PetscFree(coo->Bjmap1));
6323   PetscCall(PetscFree(coo->Aimap2));
6324   PetscCall(PetscFree(coo->Bimap2));
6325   PetscCall(PetscFree(coo->Aperm2));
6326   PetscCall(PetscFree(coo->Bperm2));
6327   PetscCall(PetscFree(coo->Ajmap2));
6328   PetscCall(PetscFree(coo->Bjmap2));
6329   PetscCall(PetscFree(coo->Cperm1));
6330   PetscCall(PetscFree2(coo->sendbuf, coo->recvbuf));
6331   PetscCall(PetscFree(coo));
6332   PetscFunctionReturn(PETSC_SUCCESS);
6333 }
6334 
6335 PetscErrorCode MatSetPreallocationCOO_MPIAIJ(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
6336 {
6337   MPI_Comm             comm;
6338   PetscMPIInt          rank, size;
6339   PetscInt             m, n, M, N, rstart, rend, cstart, cend; /* Sizes, indices of row/col, therefore with type PetscInt */
6340   PetscCount           k, p, q, rem;                           /* Loop variables over coo arrays */
6341   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6342   PetscContainer       container;
6343   MatCOOStruct_MPIAIJ *coo;
6344 
6345   PetscFunctionBegin;
6346   PetscCall(PetscFree(mpiaij->garray));
6347   PetscCall(VecDestroy(&mpiaij->lvec));
6348 #if defined(PETSC_USE_CTABLE)
6349   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
6350 #else
6351   PetscCall(PetscFree(mpiaij->colmap));
6352 #endif
6353   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
6354   mat->assembled     = PETSC_FALSE;
6355   mat->was_assembled = PETSC_FALSE;
6356 
6357   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
6358   PetscCallMPI(MPI_Comm_size(comm, &size));
6359   PetscCallMPI(MPI_Comm_rank(comm, &rank));
6360   PetscCall(PetscLayoutSetUp(mat->rmap));
6361   PetscCall(PetscLayoutSetUp(mat->cmap));
6362   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6363   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6364   PetscCall(MatGetLocalSize(mat, &m, &n));
6365   PetscCall(MatGetSize(mat, &M, &N));
6366 
6367   /* Sort (i,j) by row along with a permutation array, so that the to-be-ignored */
6368   /* entries come first, then local rows, then remote rows.                     */
6369   PetscCount n1 = coo_n, *perm1;
6370   PetscInt  *i1 = coo_i, *j1 = coo_j;
6371 
6372   PetscCall(PetscMalloc1(n1, &perm1));
6373   for (k = 0; k < n1; k++) perm1[k] = k;
6374 
6375   /* Manipulate indices so that entries with negative row or col indices will have smallest
6376      row indices, local entries will have greater but negative row indices, and remote entries
6377      will have positive row indices.
6378   */
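  /* Illustration only: with rstart = 10 and rend = 20, an entry to be ignored gets row index PETSC_INT_MIN;
     a local row 12 becomes 12 - PETSC_INT_MAX, which is negative yet greater than PETSC_INT_MIN; and a
     remote row 25 keeps the value 25. Sorting by row therefore orders ignored, then local, then remote entries. */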
6379   for (k = 0; k < n1; k++) {
6380     if (i1[k] < 0 || j1[k] < 0) i1[k] = PETSC_INT_MIN;                /* e.g., -2^31, minimal to move them ahead */
6381     else if (i1[k] >= rstart && i1[k] < rend) i1[k] -= PETSC_INT_MAX; /* e.g., minus 2^31-1 to shift local rows to range of [-PETSC_INT_MAX, -1] */
6382     else {
6383       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "MAT_NO_OFF_PROC_ENTRIES is set but insert to remote rows");
6384       if (mpiaij->donotstash) i1[k] = PETSC_INT_MIN; /* Ignore offproc entries as if they had negative indices */
6385     }
6386   }
6387 
6388   /* Sort by row; after that, [0,k) have ignored entries, [k,rem) have local rows and [rem,n1) have remote rows */
6389   PetscCall(PetscSortIntWithIntCountArrayPair(n1, i1, j1, perm1));
6390 
6391   /* Advance k to the first entry we need to take care of */
6392   for (k = 0; k < n1; k++)
6393     if (i1[k] > PETSC_INT_MIN) break;
6394   PetscCount i1start = k;
6395 
6396   PetscCall(PetscSortedIntUpperBound(i1, k, n1, rend - 1 - PETSC_INT_MAX, &rem)); /* rem is upper bound of the last local row */
6397   for (; k < rem; k++) i1[k] += PETSC_INT_MAX;                                    /* Revert row indices of local rows*/
6398 
6399   PetscCheck(n1 == 0 || i1[n1 - 1] < M, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "COO row index %" PetscInt_FMT " is >= the matrix row size %" PetscInt_FMT, i1[n1 - 1], M);
6400 
6401   /*           Send remote rows to their owner                                  */
6402   /* Find which rows should be sent to which remote ranks*/
6403   PetscInt        nsend = 0; /* Number of MPI ranks to send data to */
6404   PetscMPIInt    *sendto;    /* [nsend], storing remote ranks */
6405   PetscInt       *nentries;  /* [nsend], storing number of entries sent to remote ranks; Assume PetscInt is big enough for this count, and error if not */
6406   const PetscInt *ranges;
6407   PetscInt        maxNsend = size >= 128 ? 128 : size; /* Assume max 128 neighbors; realloc when needed */
6408 
6409   PetscCall(PetscLayoutGetRanges(mat->rmap, &ranges));
6410   PetscCall(PetscMalloc2(maxNsend, &sendto, maxNsend, &nentries));
6411   for (k = rem; k < n1;) {
6412     PetscMPIInt owner;
6413     PetscInt    firstRow, lastRow;
6414 
6415     /* Locate a row range */
6416     firstRow = i1[k]; /* first row of this owner */
6417     PetscCall(PetscLayoutFindOwner(mat->rmap, firstRow, &owner));
6418     lastRow = ranges[owner + 1] - 1; /* last row of this owner */
6419 
6420     /* Find the first index 'p' in [k,n) with i1[p] belonging to next owner */
6421     PetscCall(PetscSortedIntUpperBound(i1, k, n1, lastRow, &p));
6422 
6423     /* All entries in [k,p) belong to this remote owner */
6424     if (nsend >= maxNsend) { /* Double the remote ranks arrays if not long enough */
6425       PetscMPIInt *sendto2;
6426       PetscInt    *nentries2;
6427       PetscInt     maxNsend2 = (maxNsend <= size / 2) ? maxNsend * 2 : size;
6428 
6429       PetscCall(PetscMalloc2(maxNsend2, &sendto2, maxNsend2, &nentries2));
6430       PetscCall(PetscArraycpy(sendto2, sendto, maxNsend));
6431       PetscCall(PetscArraycpy(nentries2, nentries, maxNsend));
6432       PetscCall(PetscFree2(sendto, nentries));
6433       sendto   = sendto2;
6434       nentries = nentries2;
6435       maxNsend = maxNsend2;
6436     }
6437     sendto[nsend] = owner;
6438     PetscCall(PetscIntCast(p - k, &nentries[nsend]));
6439     nsend++;
6440     k = p;
6441   }
6442 
6443   /* Build 1st SF to know offsets on remote to send data */
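  /* Each rank contributes one leaf per destination rank, carrying the number of entries it will send there.
     The fetch-and-add below then returns, in offsets[], the starting position of this rank's block within the
     destination's receive space. Illustration only: if ranks 1 and 3 send 5 and 7 entries to rank 2 and
     rank 1's contribution is summed first, rank 2 ends up with nroots2 = 12, rank 1 sees offsets[0] = 0,
     and rank 3 sees offsets[0] = 5. */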
6444   PetscSF      sf1;
6445   PetscInt     nroots = 1, nroots2 = 0;
6446   PetscInt     nleaves = nsend, nleaves2 = 0;
6447   PetscInt    *offsets;
6448   PetscSFNode *iremote;
6449 
6450   PetscCall(PetscSFCreate(comm, &sf1));
6451   PetscCall(PetscMalloc1(nsend, &iremote));
6452   PetscCall(PetscMalloc1(nsend, &offsets));
6453   for (k = 0; k < nsend; k++) {
6454     iremote[k].rank  = sendto[k];
6455     iremote[k].index = 0;
6456     nleaves2 += nentries[k];
6457     PetscCheck(nleaves2 >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF leaves is too large for PetscInt");
6458   }
6459   PetscCall(PetscSFSetGraph(sf1, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6460   PetscCall(PetscSFFetchAndOpWithMemTypeBegin(sf1, MPIU_INT, PETSC_MEMTYPE_HOST, &nroots2 /*rootdata*/, PETSC_MEMTYPE_HOST, nentries /*leafdata*/, PETSC_MEMTYPE_HOST, offsets /*leafupdate*/, MPI_SUM));
6461   PetscCall(PetscSFFetchAndOpEnd(sf1, MPIU_INT, &nroots2, nentries, offsets, MPI_SUM)); /* Should nroots2 overflow, the non-negativity check on offsets[] below would catch it */
6462   PetscCall(PetscSFDestroy(&sf1));
6463   PetscAssert(nleaves2 == n1 - rem, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nleaves2 %" PetscInt_FMT " != number of remote entries %" PetscCount_FMT, nleaves2, n1 - rem);
6464 
6465   /* Build 2nd SF to send remote COOs to their owner */
6466   PetscSF sf2;
6467   nroots  = nroots2;
6468   nleaves = nleaves2;
6469   PetscCall(PetscSFCreate(comm, &sf2));
6470   PetscCall(PetscSFSetFromOptions(sf2));
6471   PetscCall(PetscMalloc1(nleaves, &iremote));
6472   p = 0;
6473   for (k = 0; k < nsend; k++) {
6474     PetscCheck(offsets[k] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF roots is too large for PetscInt");
6475     for (q = 0; q < nentries[k]; q++, p++) {
6476       iremote[p].rank = sendto[k];
6477       PetscCall(PetscIntCast(offsets[k] + q, &iremote[p].index));
6478     }
6479   }
6480   PetscCall(PetscSFSetGraph(sf2, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6481 
6482   /* Send the remote COOs to their owner */
6483   PetscInt    n2 = nroots, *i2, *j2; /* Buffers for received COOs from other ranks, along with a permutation array */
6484   PetscCount *perm2;                 /* Though PetscInt is enough for remote entries, we use PetscCount here as we want to reuse MatSplitEntries_Internal() */
6485   PetscCall(PetscMalloc3(n2, &i2, n2, &j2, n2, &perm2));
6486   PetscAssert(rem == 0 || i1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6487   PetscAssert(rem == 0 || j1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6488   PetscInt *i1prem = PetscSafePointerPlusOffset(i1, rem);
6489   PetscInt *j1prem = PetscSafePointerPlusOffset(j1, rem);
6490   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, i1prem, PETSC_MEMTYPE_HOST, i2, MPI_REPLACE));
6491   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, i1prem, i2, MPI_REPLACE));
6492   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, j1prem, PETSC_MEMTYPE_HOST, j2, MPI_REPLACE));
6493   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, j1prem, j2, MPI_REPLACE));
6494 
6495   PetscCall(PetscFree(offsets));
6496   PetscCall(PetscFree2(sendto, nentries));
6497 
6498   /* Sort received COOs by row along with the permutation array     */
6499   for (k = 0; k < n2; k++) perm2[k] = k;
6500   PetscCall(PetscSortIntWithIntCountArrayPair(n2, i2, j2, perm2));
6501 
6502   /* sf2 only sends contiguous leafdata to contiguous rootdata. We record the permutation which will be used to fill leafdata */
6503   PetscCount *Cperm1;
6504   PetscAssert(rem == 0 || perm1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6505   PetscCount *perm1prem = PetscSafePointerPlusOffset(perm1, rem);
6506   PetscCall(PetscMalloc1(nleaves, &Cperm1));
6507   PetscCall(PetscArraycpy(Cperm1, perm1prem, nleaves));
6508 
6509   /* Support for HYPRE matrices, kind of a hack.
6510      Swap min column with diagonal so that diagonal values will go first */
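  /* Illustration only: if this rank owns rows and diagonal-block columns [10,20) and global row 12 has
     diagonal-block entries at global columns {11,12,15}, then the entries at columns 11 and 12 exchange
     their column indices, so the diagonal value lands in the first (smallest-column) slot of that row;
     rows without a diagonal entry are left unchanged. */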
6511   PetscBool hypre;
6512   PetscCall(PetscStrcmp("_internal_COO_mat_for_hypre", ((PetscObject)mat)->name, &hypre));
6513   if (hypre) {
6514     PetscInt *minj;
6515     PetscBT   hasdiag;
6516 
6517     PetscCall(PetscBTCreate(m, &hasdiag));
6518     PetscCall(PetscMalloc1(m, &minj));
6519     for (k = 0; k < m; k++) minj[k] = PETSC_INT_MAX;
6520     for (k = i1start; k < rem; k++) {
6521       if (j1[k] < cstart || j1[k] >= cend) continue;
6522       const PetscInt rindex = i1[k] - rstart;
6523       if ((j1[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6524       minj[rindex] = PetscMin(minj[rindex], j1[k]);
6525     }
6526     for (k = 0; k < n2; k++) {
6527       if (j2[k] < cstart || j2[k] >= cend) continue;
6528       const PetscInt rindex = i2[k] - rstart;
6529       if ((j2[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6530       minj[rindex] = PetscMin(minj[rindex], j2[k]);
6531     }
6532     for (k = i1start; k < rem; k++) {
6533       const PetscInt rindex = i1[k] - rstart;
6534       if (j1[k] < cstart || j1[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6535       if (j1[k] == minj[rindex]) j1[k] = i1[k] + (cstart - rstart);
6536       else if ((j1[k] - cstart) == rindex) j1[k] = minj[rindex];
6537     }
6538     for (k = 0; k < n2; k++) {
6539       const PetscInt rindex = i2[k] - rstart;
6540       if (j2[k] < cstart || j2[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6541       if (j2[k] == minj[rindex]) j2[k] = i2[k] + (cstart - rstart);
6542       else if ((j2[k] - cstart) == rindex) j2[k] = minj[rindex];
6543     }
6544     PetscCall(PetscBTDestroy(&hasdiag));
6545     PetscCall(PetscFree(minj));
6546   }
6547 
6548   /* Split local COOs and received COOs into diag/offdiag portions */
6549   PetscCount *rowBegin1, *rowMid1, *rowEnd1;
6550   PetscCount *Ajmap1, *Aperm1, *Bjmap1, *Bperm1;
6551   PetscCount  Annz1, Bnnz1, Atot1, Btot1;
6552   PetscCount *rowBegin2, *rowMid2, *rowEnd2;
6553   PetscCount *Ajmap2, *Aperm2, *Bjmap2, *Bperm2;
6554   PetscCount  Annz2, Bnnz2, Atot2, Btot2;
6555 
6556   PetscCall(PetscCalloc3(m, &rowBegin1, m, &rowMid1, m, &rowEnd1));
6557   PetscCall(PetscCalloc3(m, &rowBegin2, m, &rowMid2, m, &rowEnd2));
6558   PetscCall(MatSplitEntries_Internal(mat, rem, i1, j1, perm1, rowBegin1, rowMid1, rowEnd1, &Atot1, &Aperm1, &Annz1, &Ajmap1, &Btot1, &Bperm1, &Bnnz1, &Bjmap1));
6559   PetscCall(MatSplitEntries_Internal(mat, n2, i2, j2, perm2, rowBegin2, rowMid2, rowEnd2, &Atot2, &Aperm2, &Annz2, &Ajmap2, &Btot2, &Bperm2, &Bnnz2, &Bjmap2));
6560 
6561   /* Merge local COOs with received COOs: diag with diag, offdiag with offdiag */
6562   PetscInt *Ai, *Bi;
6563   PetscInt *Aj, *Bj;
6564 
6565   PetscCall(PetscMalloc1(m + 1, &Ai));
6566   PetscCall(PetscMalloc1(m + 1, &Bi));
6567   PetscCall(PetscMalloc1(Annz1 + Annz2, &Aj)); /* Since local and remote entries might have dups, we might allocate excess memory */
6568   PetscCall(PetscMalloc1(Bnnz1 + Bnnz2, &Bj));
6569 
6570   PetscCount *Aimap1, *Bimap1, *Aimap2, *Bimap2;
6571   PetscCall(PetscMalloc1(Annz1, &Aimap1));
6572   PetscCall(PetscMalloc1(Bnnz1, &Bimap1));
6573   PetscCall(PetscMalloc1(Annz2, &Aimap2));
6574   PetscCall(PetscMalloc1(Bnnz2, &Bimap2));
6575 
6576   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowBegin1, rowMid1, rowBegin2, rowMid2, Ajmap1, Ajmap2, Aimap1, Aimap2, Ai, Aj));
6577   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowMid1, rowEnd1, rowMid2, rowEnd2, Bjmap1, Bjmap2, Bimap1, Bimap2, Bi, Bj));
6578 
6579   /* Expand Ajmap1/Bjmap1 to make them based off nonzeros in A/B, since we     */
6580   /* expect nonzeros in A/B most likely have local contributing entries        */
6581   PetscInt    Annz = Ai[m];
6582   PetscInt    Bnnz = Bi[m];
6583   PetscCount *Ajmap1_new, *Bjmap1_new;
6584 
6585   PetscCall(PetscMalloc1(Annz + 1, &Ajmap1_new));
6586   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap1_new));
6587 
6588   PetscCall(ExpandJmap_Internal(Annz1, Annz, Aimap1, Ajmap1, Ajmap1_new));
6589   PetscCall(ExpandJmap_Internal(Bnnz1, Bnnz, Bimap1, Bjmap1, Bjmap1_new));
6590 
6591   PetscCall(PetscFree(Aimap1));
6592   PetscCall(PetscFree(Ajmap1));
6593   PetscCall(PetscFree(Bimap1));
6594   PetscCall(PetscFree(Bjmap1));
6595   PetscCall(PetscFree3(rowBegin1, rowMid1, rowEnd1));
6596   PetscCall(PetscFree3(rowBegin2, rowMid2, rowEnd2));
6597   PetscCall(PetscFree(perm1));
6598   PetscCall(PetscFree3(i2, j2, perm2));
6599 
6600   Ajmap1 = Ajmap1_new;
6601   Bjmap1 = Bjmap1_new;
6602 
6603   /* Reallocate Aj, Bj once we know actual numbers of unique nonzeros in A and B */
6604   if (Annz < Annz1 + Annz2) {
6605     PetscInt *Aj_new;
6606     PetscCall(PetscMalloc1(Annz, &Aj_new));
6607     PetscCall(PetscArraycpy(Aj_new, Aj, Annz));
6608     PetscCall(PetscFree(Aj));
6609     Aj = Aj_new;
6610   }
6611 
6612   if (Bnnz < Bnnz1 + Bnnz2) {
6613     PetscInt *Bj_new;
6614     PetscCall(PetscMalloc1(Bnnz, &Bj_new));
6615     PetscCall(PetscArraycpy(Bj_new, Bj, Bnnz));
6616     PetscCall(PetscFree(Bj));
6617     Bj = Bj_new;
6618   }
6619 
6620   /* Create new submatrices for on-process and off-process coupling                  */
6621   PetscScalar     *Aa, *Ba;
6622   MatType          rtype;
6623   Mat_SeqAIJ      *a, *b;
6624   PetscObjectState state;
6625   PetscCall(PetscCalloc1(Annz, &Aa)); /* Zero matrix on device */
6626   PetscCall(PetscCalloc1(Bnnz, &Ba));
6627   /* make Aj[] local, i.e., based off the start column of the diagonal portion */
6628   if (cstart) {
6629     for (k = 0; k < Annz; k++) Aj[k] -= cstart;
6630   }
6631 
6632   PetscCall(MatGetRootType_Private(mat, &rtype));
6633 
6634   MatSeqXAIJGetOptions_Private(mpiaij->A);
6635   PetscCall(MatDestroy(&mpiaij->A));
6636   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, Ai, Aj, Aa, &mpiaij->A));
6637   PetscCall(MatSetBlockSizesFromMats(mpiaij->A, mat, mat));
6638   MatSeqXAIJRestoreOptions_Private(mpiaij->A);
6639 
6640   MatSeqXAIJGetOptions_Private(mpiaij->B);
6641   PetscCall(MatDestroy(&mpiaij->B));
6642   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, mat->cmap->N, Bi, Bj, Ba, &mpiaij->B));
6643   PetscCall(MatSetBlockSizesFromMats(mpiaij->B, mat, mat));
6644   MatSeqXAIJRestoreOptions_Private(mpiaij->B);
6645 
6646   PetscCall(MatSetUpMultiply_MPIAIJ(mat));
6647   mat->was_assembled = PETSC_TRUE; // was_assembled in effect means the Mvctx is built; doing so avoids redundant MatSetUpMultiply_MPIAIJ
6648   state              = mpiaij->A->nonzerostate + mpiaij->B->nonzerostate;
6649   PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
6650 
6651   a          = (Mat_SeqAIJ *)mpiaij->A->data;
6652   b          = (Mat_SeqAIJ *)mpiaij->B->data;
6653   a->free_a  = PETSC_TRUE;
6654   a->free_ij = PETSC_TRUE;
6655   b->free_a  = PETSC_TRUE;
6656   b->free_ij = PETSC_TRUE;
6657   a->maxnz   = a->nz;
6658   b->maxnz   = b->nz;
6659 
6660   /* conversion must happen AFTER multiply setup */
6661   PetscCall(MatConvert(mpiaij->A, rtype, MAT_INPLACE_MATRIX, &mpiaij->A));
6662   PetscCall(MatConvert(mpiaij->B, rtype, MAT_INPLACE_MATRIX, &mpiaij->B));
6663   PetscCall(VecDestroy(&mpiaij->lvec));
6664   PetscCall(MatCreateVecs(mpiaij->B, &mpiaij->lvec, NULL));
6665 
6666   // Put the COO struct in a container and then attach that to the matrix
6667   PetscCall(PetscMalloc1(1, &coo));
6668   coo->n       = coo_n;
6669   coo->sf      = sf2;
6670   coo->sendlen = nleaves;
6671   coo->recvlen = nroots;
6672   coo->Annz    = Annz;
6673   coo->Bnnz    = Bnnz;
6674   coo->Annz2   = Annz2;
6675   coo->Bnnz2   = Bnnz2;
6676   coo->Atot1   = Atot1;
6677   coo->Atot2   = Atot2;
6678   coo->Btot1   = Btot1;
6679   coo->Btot2   = Btot2;
6680   coo->Ajmap1  = Ajmap1;
6681   coo->Aperm1  = Aperm1;
6682   coo->Bjmap1  = Bjmap1;
6683   coo->Bperm1  = Bperm1;
6684   coo->Aimap2  = Aimap2;
6685   coo->Ajmap2  = Ajmap2;
6686   coo->Aperm2  = Aperm2;
6687   coo->Bimap2  = Bimap2;
6688   coo->Bjmap2  = Bjmap2;
6689   coo->Bperm2  = Bperm2;
6690   coo->Cperm1  = Cperm1;
6691   // Allocate in preallocation. If not used, it has zero cost on host
6692   // Allocate the send/recv buffers here in preallocation. If not used, they have zero cost on host
6693   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
6694   PetscCall(PetscContainerSetPointer(container, coo));
6695   PetscCall(PetscContainerSetCtxDestroy(container, MatCOOStructDestroy_MPIAIJ));
6696   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject)container));
6697   PetscCall(PetscContainerDestroy(&container));
6698   PetscFunctionReturn(PETSC_SUCCESS);
6699 }
6700 
6701 static PetscErrorCode MatSetValuesCOO_MPIAIJ(Mat mat, const PetscScalar v[], InsertMode imode)
6702 {
6703   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6704   Mat                  A = mpiaij->A, B = mpiaij->B;
6705   PetscScalar         *Aa, *Ba;
6706   PetscScalar         *sendbuf, *recvbuf;
6707   const PetscCount    *Ajmap1, *Ajmap2, *Aimap2;
6708   const PetscCount    *Bjmap1, *Bjmap2, *Bimap2;
6709   const PetscCount    *Aperm1, *Aperm2, *Bperm1, *Bperm2;
6710   const PetscCount    *Cperm1;
6711   PetscContainer       container;
6712   MatCOOStruct_MPIAIJ *coo;
6713 
6714   PetscFunctionBegin;
6715   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container));
6716   PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
6717   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
6718   sendbuf = coo->sendbuf;
6719   recvbuf = coo->recvbuf;
6720   Ajmap1  = coo->Ajmap1;
6721   Ajmap2  = coo->Ajmap2;
6722   Aimap2  = coo->Aimap2;
6723   Bjmap1  = coo->Bjmap1;
6724   Bjmap2  = coo->Bjmap2;
6725   Bimap2  = coo->Bimap2;
6726   Aperm1  = coo->Aperm1;
6727   Aperm2  = coo->Aperm2;
6728   Bperm1  = coo->Bperm1;
6729   Bperm2  = coo->Bperm2;
6730   Cperm1  = coo->Cperm1;
6731 
6732   PetscCall(MatSeqAIJGetArray(A, &Aa)); /* Might read and write matrix values */
6733   PetscCall(MatSeqAIJGetArray(B, &Ba));
6734 
6735   /* Pack entries to be sent to remote */
6736   for (PetscCount i = 0; i < coo->sendlen; i++) sendbuf[i] = v[Cperm1[i]];
6737 
6738   /* Send remote entries to their owner and overlap the communication with local computation */
6739   PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_HOST, sendbuf, PETSC_MEMTYPE_HOST, recvbuf, MPI_REPLACE));
6740   /* Add local entries to A and B */
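  /* Illustration only: with Ajmap1 = {0,2,3} and Aperm1 = {0,2,3}, the loop below computes Aa[0] from
     v[0] + v[2] and Aa[1] from v[3], adding to the existing values for ADD_VALUES and overwriting them
     for INSERT_VALUES. */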
6741   for (PetscCount i = 0; i < coo->Annz; i++) { /* All nonzeros in A are either zero'ed or added with a value (i.e., initialized) */
6742     PetscScalar sum = 0.0;                     /* Do partial summation first to improve numerical stability */
6743     for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += v[Aperm1[k]];
6744     Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
6745   }
6746   for (PetscCount i = 0; i < coo->Bnnz; i++) {
6747     PetscScalar sum = 0.0;
6748     for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += v[Bperm1[k]];
6749     Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
6750   }
6751   PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE));
6752 
6753   /* Add received remote entries to A and B */
6754   for (PetscCount i = 0; i < coo->Annz2; i++) {
6755     for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += recvbuf[Aperm2[k]];
6756   }
6757   for (PetscCount i = 0; i < coo->Bnnz2; i++) {
6758     for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += recvbuf[Bperm2[k]];
6759   }
6760   PetscCall(MatSeqAIJRestoreArray(A, &Aa));
6761   PetscCall(MatSeqAIJRestoreArray(B, &Ba));
6762   PetscFunctionReturn(PETSC_SUCCESS);
6763 }
6764 
6765 /*MC
6766    MATMPIAIJ - MATMPIAIJ = "mpiaij" - A matrix type to be used for parallel sparse matrices.
6767 
6768    Options Database Keys:
6769 . -mat_type mpiaij - sets the matrix type to `MATMPIAIJ` during a call to `MatSetFromOptions()`
6770 
6771    Level: beginner
6772 
6773    Notes:
6774    `MatSetValues()` may be called for this matrix type with a `NULL` argument for the numerical values;
6775     in that case the values associated with the rows and columns one passes in are set to zero
6776     in the matrix.
6777 
6778     `MatSetOption`(A, `MAT_STRUCTURE_ONLY`, `PETSC_TRUE`) may be called for this matrix type. In this case no
6779     space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored.
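
   Example Usage (a minimal sketch of a typical assembly, for illustration only):
.vb
   Mat      A;
   PetscInt M = 100, N = 100; // global sizes, illustrative

   MatCreate(PETSC_COMM_WORLD, &A);
   MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, M, N);
   MatSetType(A, MATMPIAIJ);
   MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL); // rough per-row estimates for the diagonal and off-diagonal blocks
   // ... insert entries with MatSetValues() ...
   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
.ve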
6780 
6781 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJ`, `MATAIJ`, `MatCreateAIJ()`
6782 M*/
6783 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJ(Mat B)
6784 {
6785   Mat_MPIAIJ *b;
6786   PetscMPIInt size;
6787 
6788   PetscFunctionBegin;
6789   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
6790 
6791   PetscCall(PetscNew(&b));
6792   B->data       = (void *)b;
6793   B->ops[0]     = MatOps_Values;
6794   B->assembled  = PETSC_FALSE;
6795   B->insertmode = NOT_SET_VALUES;
6796   b->size       = size;
6797 
6798   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B), &b->rank));
6799 
6800   /* build cache for off array entries formed */
6801   PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)B), 1, &B->stash));
6802 
6803   b->donotstash  = PETSC_FALSE;
6804   b->colmap      = NULL;
6805   b->garray      = NULL;
6806   b->roworiented = PETSC_TRUE;
6807 
6808   /* stuff used for matrix vector multiply */
6809   b->lvec  = NULL;
6810   b->Mvctx = NULL;
6811 
6812   /* stuff for MatGetRow() */
6813   b->rowindices   = NULL;
6814   b->rowvalues    = NULL;
6815   b->getrowactive = PETSC_FALSE;
6816 
6817   /* flexible pointer used in CUSPARSE classes */
6818   b->spptr = NULL;
6819 
6820   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetUseScalableIncreaseOverlap_C", MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ));
6821   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_MPIAIJ));
6822   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_MPIAIJ));
6823   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_MPIAIJ));
6824   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJ));
6825   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetPreallocation_C", MatResetPreallocation_MPIAIJ));
6826   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetHash_C", MatResetHash_MPIAIJ));
6827   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocationCSR_C", MatMPIAIJSetPreallocationCSR_MPIAIJ));
6828   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDiagonalScaleLocal_C", MatDiagonalScaleLocal_MPIAIJ));
6829   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijperm_C", MatConvert_MPIAIJ_MPIAIJPERM));
6830   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijsell_C", MatConvert_MPIAIJ_MPIAIJSELL));
6831 #if defined(PETSC_HAVE_CUDA)
6832   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcusparse_C", MatConvert_MPIAIJ_MPIAIJCUSPARSE));
6833 #endif
6834 #if defined(PETSC_HAVE_HIP)
6835   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijhipsparse_C", MatConvert_MPIAIJ_MPIAIJHIPSPARSE));
6836 #endif
6837 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6838   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijkokkos_C", MatConvert_MPIAIJ_MPIAIJKokkos));
6839 #endif
6840 #if defined(PETSC_HAVE_MKL_SPARSE)
6841   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijmkl_C", MatConvert_MPIAIJ_MPIAIJMKL));
6842 #endif
6843   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcrl_C", MatConvert_MPIAIJ_MPIAIJCRL));
6844   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpibaij_C", MatConvert_MPIAIJ_MPIBAIJ));
6845   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisbaij_C", MatConvert_MPIAIJ_MPISBAIJ));
6846   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpidense_C", MatConvert_MPIAIJ_MPIDense));
6847 #if defined(PETSC_HAVE_ELEMENTAL)
6848   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_elemental_C", MatConvert_MPIAIJ_Elemental));
6849 #endif
6850 #if defined(PETSC_HAVE_SCALAPACK) && (defined(PETSC_USE_REAL_SINGLE) || defined(PETSC_USE_REAL_DOUBLE))
6851   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_scalapack_C", MatConvert_AIJ_ScaLAPACK));
6852 #endif
6853   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_is_C", MatConvert_XAIJ_IS));
6854   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisell_C", MatConvert_MPIAIJ_MPISELL));
6855 #if defined(PETSC_HAVE_HYPRE)
6856   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_hypre_C", MatConvert_AIJ_HYPRE));
6857   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", MatProductSetFromOptions_Transpose_AIJ_AIJ));
6858 #endif
6859   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_is_mpiaij_C", MatProductSetFromOptions_IS_XAIJ));
6860   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_mpiaij_mpiaij_C", MatProductSetFromOptions_MPIAIJ));
6861   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJ));
6862   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJ));
6863   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJ));
6864   PetscFunctionReturn(PETSC_SUCCESS);
6865 }
6866 
6867 /*@
6868   MatCreateMPIAIJWithSplitArrays - creates a `MATMPIAIJ` matrix using arrays that contain the "diagonal"
6869   and "off-diagonal" part of the matrix in CSR format.
6870 
6871   Collective
6872 
6873   Input Parameters:
6874 + comm - MPI communicator
6875 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
6876 . n    - This value should be the same as the local size used in creating the
6877          x vector for the matrix-vector product $y = Ax$ (or `PETSC_DECIDE` to have
6878          it calculated if `N` is given). For square matrices `n` is almost always `m`.
6879 . M    - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
6880 . N    - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
6881 . i    - row indices for "diagonal" portion of matrix; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
6882 . j    - column indices, which must be local, i.e., based off the start column of the diagonal portion
6883 . a    - matrix values
6884 . oi   - row indices for "off-diagonal" portion of matrix; that is oi[0] = 0, oi[row] = oi[row-1] + number of elements in that row of the matrix
6885 . oj   - column indices, which must be global, representing global columns in the `MATMPIAIJ` matrix
6886 - oa   - matrix values
6887 
6888   Output Parameter:
6889 . mat - the matrix
6890 
6891   Level: advanced
6892 
6893   Notes:
6894   The `i`, `j`, and `a` arrays ARE NOT copied by this routine into the internal format used by PETSc (even in Fortran). The user
6895   must free the arrays once the matrix has been destroyed and not before.
6896 
6897   The `i` and `j` indices are 0 based
6898 
6899   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix
6900 
6901   This sets local rows and cannot be used to set off-processor values.
6902 
6903   Use of this routine is discouraged because it is inflexible and cumbersome to use. It is extremely rare that a
6904   legacy application natively assembles into exactly this split format. The code to do so is nontrivial and does
6905   not easily support in-place reassembly. It is recommended to use `MatSetValues()` (or a variant thereof) because
6906   the resulting assembly is easier to implement, will work with any matrix format, and the user does not have to
6907   keep track of the underlying array. Use `MatSetOption`(A,`MAT_NO_OFF_PROC_ENTRIES`,`PETSC_TRUE`) to disable all
6908   communication if it is known that only local entries will be set.
6909 
6910 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
6911           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithArrays()`
6912 @*/
6913 PetscErrorCode MatCreateMPIAIJWithSplitArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt i[], PetscInt j[], PetscScalar a[], PetscInt oi[], PetscInt oj[], PetscScalar oa[], Mat *mat)
6914 {
6915   Mat_MPIAIJ *maij;
6916 
6917   PetscFunctionBegin;
6918   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
6919   PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
6920   PetscCheck(oi[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "oi (row indices) must start with 0");
6921   PetscCall(MatCreate(comm, mat));
6922   PetscCall(MatSetSizes(*mat, m, n, M, N));
6923   PetscCall(MatSetType(*mat, MATMPIAIJ));
6924   maij = (Mat_MPIAIJ *)(*mat)->data;
6925 
6926   (*mat)->preallocated = PETSC_TRUE;
6927 
6928   PetscCall(PetscLayoutSetUp((*mat)->rmap));
6929   PetscCall(PetscLayoutSetUp((*mat)->cmap));
6930 
6931   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, i, j, a, &maij->A));
6932   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, (*mat)->cmap->N, oi, oj, oa, &maij->B));
6933 
6934   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
6935   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
6936   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
6937   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
6938   PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
6939   PetscFunctionReturn(PETSC_SUCCESS);
6940 }
6941 
6942 typedef struct {
6943   Mat       *mp;    /* intermediate products */
6944   PetscBool *mptmp; /* is the intermediate product temporary ? */
6945   PetscInt   cp;    /* number of intermediate products */
6946 
6947   /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
6948   PetscInt    *startsj_s, *startsj_r;
6949   PetscScalar *bufa;
6950   Mat          P_oth;
6951 
6952   /* may take advantage of merging product->B */
6953   Mat Bloc; /* B-local by merging diag and off-diag */
6954 
6955   /* cusparse does not support splitting the symbolic and numeric phases.
6956      When api_user is true, we don't need to update the numerical values
6957      of the temporary storage */
6958   PetscBool reusesym;
6959 
6960   /* support for COO values insertion */
6961   PetscScalar *coo_v, *coo_w; /* store on-process and off-process COO scalars, and used as MPI recv/send buffers respectively */
6962   PetscInt   **own;           /* own[i] points to address of on-process COO indices for Mat mp[i] */
6963   PetscInt   **off;           /* off[i] points to address of off-process COO indices for Mat mp[i] */
6964   PetscBool    hasoffproc;    /* if true, have off-process values insertion (i.e. AtB or PtAP) */
6965   PetscSF      sf;            /* used for non-local values insertion and memory malloc */
6966   PetscMemType mtype;
6967 
6968   /* customization */
6969   PetscBool abmerge;
6970   PetscBool P_oth_bind;
6971 } MatMatMPIAIJBACKEND;
6972 
6973 static PetscErrorCode MatProductCtxDestroy_MatMatMPIAIJBACKEND(void **data)
6974 {
6975   MatMatMPIAIJBACKEND *mmdata = *(MatMatMPIAIJBACKEND **)data;
6976   PetscInt             i;
6977 
6978   PetscFunctionBegin;
6979   PetscCall(PetscFree2(mmdata->startsj_s, mmdata->startsj_r));
6980   PetscCall(PetscFree(mmdata->bufa));
6981   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_v));
6982   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_w));
6983   PetscCall(MatDestroy(&mmdata->P_oth));
6984   PetscCall(MatDestroy(&mmdata->Bloc));
6985   PetscCall(PetscSFDestroy(&mmdata->sf));
6986   for (i = 0; i < mmdata->cp; i++) PetscCall(MatDestroy(&mmdata->mp[i]));
6987   PetscCall(PetscFree2(mmdata->mp, mmdata->mptmp));
6988   PetscCall(PetscFree(mmdata->own[0]));
6989   PetscCall(PetscFree(mmdata->own));
6990   PetscCall(PetscFree(mmdata->off[0]));
6991   PetscCall(PetscFree(mmdata->off));
6992   PetscCall(PetscFree(mmdata));
6993   PetscFunctionReturn(PETSC_SUCCESS);
6994 }
6995 
6996 /* Copy selected n entries with indices in idx[] of A to v[].
6997    If idx is NULL, copy the whole data array of A to v[]
6998  */
6999 static PetscErrorCode MatSeqAIJCopySubArray(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
7000 {
7001   PetscErrorCode (*f)(Mat, PetscInt, const PetscInt[], PetscScalar[]);
7002 
7003   PetscFunctionBegin;
7004   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", &f));
7005   if (f) {
7006     PetscCall((*f)(A, n, idx, v));
7007   } else {
7008     const PetscScalar *vv;
7009 
7010     PetscCall(MatSeqAIJGetArrayRead(A, &vv));
7011     if (n && idx) {
7012       PetscScalar    *w  = v;
7013       const PetscInt *oi = idx;
7014       PetscInt        j;
7015 
7016       for (j = 0; j < n; j++) *w++ = vv[*oi++];
7017     } else {
7018       PetscCall(PetscArraycpy(v, vv, n));
7019     }
7020     PetscCall(MatSeqAIJRestoreArrayRead(A, &vv));
7021   }
7022   PetscFunctionReturn(PETSC_SUCCESS);
7023 }
7024 
7025 static PetscErrorCode MatProductNumeric_MPIAIJBACKEND(Mat C)
7026 {
7027   MatMatMPIAIJBACKEND *mmdata;
7028   PetscInt             i, n_d, n_o;
7029 
7030   PetscFunctionBegin;
7031   MatCheckProduct(C, 1);
7032   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
7033   mmdata = (MatMatMPIAIJBACKEND *)C->product->data;
7034   if (!mmdata->reusesym) { /* update temporary matrices */
7035     if (mmdata->P_oth) PetscCall(MatGetBrowsOfAoCols_MPIAIJ(C->product->A, C->product->B, MAT_REUSE_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7036     if (mmdata->Bloc) PetscCall(MatMPIAIJGetLocalMatMerge(C->product->B, MAT_REUSE_MATRIX, NULL, &mmdata->Bloc));
7037   }
7038   mmdata->reusesym = PETSC_FALSE;
7039 
7040   for (i = 0; i < mmdata->cp; i++) {
7041     PetscCheck(mmdata->mp[i]->ops->productnumeric, PetscObjectComm((PetscObject)mmdata->mp[i]), PETSC_ERR_PLIB, "Missing numeric op for %s", MatProductTypes[mmdata->mp[i]->product->type]);
7042     PetscCall((*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]));
7043   }
7044   for (i = 0, n_d = 0, n_o = 0; i < mmdata->cp; i++) {
7045     PetscInt noff;
7046 
7047     PetscCall(PetscIntCast(mmdata->off[i + 1] - mmdata->off[i], &noff));
7048     if (mmdata->mptmp[i]) continue;
7049     if (noff) {
7050       PetscInt nown;
7051 
7052       PetscCall(PetscIntCast(mmdata->own[i + 1] - mmdata->own[i], &nown));
7053       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], noff, mmdata->off[i], mmdata->coo_w + n_o));
7054       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], nown, mmdata->own[i], mmdata->coo_v + n_d));
7055       n_o += noff;
7056       n_d += nown;
7057     } else {
7058       Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mmdata->mp[i]->data;
7059 
7060       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], mm->nz, NULL, mmdata->coo_v + n_d));
7061       n_d += mm->nz;
7062     }
7063   }
7064   if (mmdata->hasoffproc) { /* offprocess insertion */
7065     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7066     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7067   }
7068   PetscCall(MatSetValuesCOO(C, mmdata->coo_v, INSERT_VALUES));
7069   PetscFunctionReturn(PETSC_SUCCESS);
7070 }
7071 
7072 /* Support for Pt * A, A * P, or Pt * A * P */
7073 #define MAX_NUMBER_INTERMEDIATE 4
7074 PetscErrorCode MatProductSymbolic_MPIAIJBACKEND(Mat C)
7075 {
7076   Mat_Product           *product = C->product;
7077   Mat                    A, P, mp[MAX_NUMBER_INTERMEDIATE]; /* A, P and a series of intermediate matrices */
7078   Mat_MPIAIJ            *a, *p;
7079   MatMatMPIAIJBACKEND   *mmdata;
7080   ISLocalToGlobalMapping P_oth_l2g = NULL;
7081   IS                     glob      = NULL;
7082   const char            *prefix;
7083   char                   pprefix[256];
7084   const PetscInt        *globidx, *P_oth_idx;
7085   PetscInt               i, j, cp, m, n, M, N, *coo_i, *coo_j;
7086   PetscCount             ncoo, ncoo_d, ncoo_o, ncoo_oown;
7087   PetscInt               cmapt[MAX_NUMBER_INTERMEDIATE], rmapt[MAX_NUMBER_INTERMEDIATE]; /* col/row map type for each Mat in mp[]. */
7088                                                                                          /* type-0: consecutive, start from 0; type-1: consecutive with */
7089                                                                                          /* a base offset; type-2: sparse with a local to global map table */
7090   const PetscInt *cmapa[MAX_NUMBER_INTERMEDIATE], *rmapa[MAX_NUMBER_INTERMEDIATE];       /* col/row local to global map array (table) for type-2 map type */
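  /* Illustration only: a type-1 row map sends local row r to a base offset plus r, while a type-2 row map
     sends local row r to the local-to-global table entry rmapa[r]; column maps use cmapa[] analogously. */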
7091 
7092   MatProductType ptype;
7093   PetscBool      mptmp[MAX_NUMBER_INTERMEDIATE], hasoffproc = PETSC_FALSE, iscuda, iship, iskokk;
7094   PetscMPIInt    size;
7095 
7096   PetscFunctionBegin;
7097   MatCheckProduct(C, 1);
7098   PetscCheck(!product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
7099   ptype = product->type;
7100   if (product->A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
7101     ptype                                          = MATPRODUCT_AB;
7102     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
7103   }
7104   switch (ptype) {
7105   case MATPRODUCT_AB:
7106     A          = product->A;
7107     P          = product->B;
7108     m          = A->rmap->n;
7109     n          = P->cmap->n;
7110     M          = A->rmap->N;
7111     N          = P->cmap->N;
7112     hasoffproc = PETSC_FALSE; /* will not scatter mat product values to other processes */
7113     break;
7114   case MATPRODUCT_AtB:
7115     P          = product->A;
7116     A          = product->B;
7117     m          = P->cmap->n;
7118     n          = A->cmap->n;
7119     M          = P->cmap->N;
7120     N          = A->cmap->N;
7121     hasoffproc = PETSC_TRUE;
7122     break;
7123   case MATPRODUCT_PtAP:
7124     A          = product->A;
7125     P          = product->B;
7126     m          = P->cmap->n;
7127     n          = P->cmap->n;
7128     M          = P->cmap->N;
7129     N          = P->cmap->N;
7130     hasoffproc = PETSC_TRUE;
7131     break;
7132   default:
7133     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7134   }
7135   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)C), &size));
7136   if (size == 1) hasoffproc = PETSC_FALSE;
7137 
7138   /* defaults */
7139   for (i = 0; i < MAX_NUMBER_INTERMEDIATE; i++) {
7140     mp[i]    = NULL;
7141     mptmp[i] = PETSC_FALSE;
7142     rmapt[i] = -1;
7143     cmapt[i] = -1;
7144     rmapa[i] = NULL;
7145     cmapa[i] = NULL;
7146   }
7147 
7148   /* customization */
7149   PetscCall(PetscNew(&mmdata));
7150   mmdata->reusesym = product->api_user;
7151   if (ptype == MATPRODUCT_AB) {
7152     if (product->api_user) {
7153       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
7154       PetscCall(PetscOptionsBool("-matmatmult_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7155       PetscCall(PetscOptionsBool("-matmatmult_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7156       PetscOptionsEnd();
7157     } else {
7158       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
7159       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7160       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7161       PetscOptionsEnd();
7162     }
7163   } else if (ptype == MATPRODUCT_PtAP) {
7164     if (product->api_user) {
7165       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
7166       PetscCall(PetscOptionsBool("-matptap_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7167       PetscOptionsEnd();
7168     } else {
7169       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
7170       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7171       PetscOptionsEnd();
7172     }
7173   }
7174   a = (Mat_MPIAIJ *)A->data;
7175   p = (Mat_MPIAIJ *)P->data;
7176   PetscCall(MatSetSizes(C, m, n, M, N));
7177   PetscCall(PetscLayoutSetUp(C->rmap));
7178   PetscCall(PetscLayoutSetUp(C->cmap));
7179   PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
7180   PetscCall(MatGetOptionsPrefix(C, &prefix));
7181 
7182   cp = 0;
7183   switch (ptype) {
7184   case MATPRODUCT_AB: /* A * P */
7185     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7186 
7187     /* A_diag * P_local (merged or not) */
7188     if (mmdata->abmerge) { /* P's diagonal and off-diag blocks are merged to one matrix, then multiplied by A_diag */
7189       /* P is product->B */
7190       PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7191       PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7192       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7193       PetscCall(MatProductSetFill(mp[cp], product->fill));
7194       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7195       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7196       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7197       mp[cp]->product->api_user = product->api_user;
7198       PetscCall(MatProductSetFromOptions(mp[cp]));
7199       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7200       PetscCall(ISGetIndices(glob, &globidx));
7201       rmapt[cp] = 1;
7202       cmapt[cp] = 2;
7203       cmapa[cp] = globidx;
7204       mptmp[cp] = PETSC_FALSE;
7205       cp++;
7206     } else { /* A_diag * P_diag and A_diag * P_off */
7207       PetscCall(MatProductCreate(a->A, p->A, NULL, &mp[cp]));
7208       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7209       PetscCall(MatProductSetFill(mp[cp], product->fill));
7210       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7211       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7212       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7213       mp[cp]->product->api_user = product->api_user;
7214       PetscCall(MatProductSetFromOptions(mp[cp]));
7215       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7216       rmapt[cp] = 1;
7217       cmapt[cp] = 1;
7218       mptmp[cp] = PETSC_FALSE;
7219       cp++;
7220       PetscCall(MatProductCreate(a->A, p->B, NULL, &mp[cp]));
7221       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7222       PetscCall(MatProductSetFill(mp[cp], product->fill));
7223       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7224       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7225       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7226       mp[cp]->product->api_user = product->api_user;
7227       PetscCall(MatProductSetFromOptions(mp[cp]));
7228       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7229       rmapt[cp] = 1;
7230       cmapt[cp] = 2;
7231       cmapa[cp] = p->garray;
7232       mptmp[cp] = PETSC_FALSE;
7233       cp++;
7234     }
7235 
7236     /* A_off * P_other */
7237     if (mmdata->P_oth) {
7238       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g)); /* make P_oth use local col ids */
7239       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7240       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7241       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7242       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7243       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7244       PetscCall(MatProductSetFill(mp[cp], product->fill));
7245       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7246       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7247       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7248       mp[cp]->product->api_user = product->api_user;
7249       PetscCall(MatProductSetFromOptions(mp[cp]));
7250       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7251       rmapt[cp] = 1;
7252       cmapt[cp] = 2;
7253       cmapa[cp] = P_oth_idx;
7254       mptmp[cp] = PETSC_FALSE;
7255       cp++;
7256     }
7257     break;
7258 
7259   case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
7260     /* A is product->B */
7261     PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7262     if (A == P) { /* when A==P, we can take advantage of the already merged mmdata->Bloc */
7263       PetscCall(MatProductCreate(mmdata->Bloc, mmdata->Bloc, NULL, &mp[cp]));
7264       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7265       PetscCall(MatProductSetFill(mp[cp], product->fill));
7266       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7267       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7268       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7269       mp[cp]->product->api_user = product->api_user;
7270       PetscCall(MatProductSetFromOptions(mp[cp]));
7271       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7272       PetscCall(ISGetIndices(glob, &globidx));
7273       rmapt[cp] = 2;
7274       rmapa[cp] = globidx;
7275       cmapt[cp] = 2;
7276       cmapa[cp] = globidx;
7277       mptmp[cp] = PETSC_FALSE;
7278       cp++;
7279     } else {
7280       PetscCall(MatProductCreate(p->A, mmdata->Bloc, NULL, &mp[cp]));
7281       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7282       PetscCall(MatProductSetFill(mp[cp], product->fill));
7283       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7284       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7285       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7286       mp[cp]->product->api_user = product->api_user;
7287       PetscCall(MatProductSetFromOptions(mp[cp]));
7288       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7289       PetscCall(ISGetIndices(glob, &globidx));
7290       rmapt[cp] = 1;
7291       cmapt[cp] = 2;
7292       cmapa[cp] = globidx;
7293       mptmp[cp] = PETSC_FALSE;
7294       cp++;
7295       PetscCall(MatProductCreate(p->B, mmdata->Bloc, NULL, &mp[cp]));
7296       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7297       PetscCall(MatProductSetFill(mp[cp], product->fill));
7298       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7299       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7300       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7301       mp[cp]->product->api_user = product->api_user;
7302       PetscCall(MatProductSetFromOptions(mp[cp]));
7303       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7304       rmapt[cp] = 2;
7305       rmapa[cp] = p->garray;
7306       cmapt[cp] = 2;
7307       cmapa[cp] = globidx;
7308       mptmp[cp] = PETSC_FALSE;
7309       cp++;
7310     }
7311     break;
7312   case MATPRODUCT_PtAP:
7313     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7314     /* P is product->B */
7315     PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7316     PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7317     PetscCall(MatProductSetType(mp[cp], MATPRODUCT_PtAP));
7318     PetscCall(MatProductSetFill(mp[cp], product->fill));
7319     PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7320     PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7321     PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7322     mp[cp]->product->api_user = product->api_user;
7323     PetscCall(MatProductSetFromOptions(mp[cp]));
7324     PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7325     PetscCall(ISGetIndices(glob, &globidx));
7326     rmapt[cp] = 2;
7327     rmapa[cp] = globidx;
7328     cmapt[cp] = 2;
7329     cmapa[cp] = globidx;
7330     mptmp[cp] = PETSC_FALSE;
7331     cp++;
7332     if (mmdata->P_oth) {
7333       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g));
7334       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7335       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7336       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7337       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7338       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7339       PetscCall(MatProductSetFill(mp[cp], product->fill));
7340       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7341       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7342       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7343       mp[cp]->product->api_user = product->api_user;
7344       PetscCall(MatProductSetFromOptions(mp[cp]));
7345       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7346       mptmp[cp] = PETSC_TRUE;
7347       cp++;
7348       PetscCall(MatProductCreate(mmdata->Bloc, mp[1], NULL, &mp[cp]));
7349       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7350       PetscCall(MatProductSetFill(mp[cp], product->fill));
7351       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7352       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7353       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7354       mp[cp]->product->api_user = product->api_user;
7355       PetscCall(MatProductSetFromOptions(mp[cp]));
7356       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7357       rmapt[cp] = 2;
7358       rmapa[cp] = globidx;
7359       cmapt[cp] = 2;
7360       cmapa[cp] = P_oth_idx;
7361       mptmp[cp] = PETSC_FALSE;
7362       cp++;
7363     }
7364     break;
7365   default:
7366     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7367   }
7368   /* sanity check */
7369   if (size > 1)
7370     for (i = 0; i < cp; i++) PetscCheck(rmapt[i] != 2 || hasoffproc, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected offproc map type for product %" PetscInt_FMT, i);
7371 
7372   PetscCall(PetscMalloc2(cp, &mmdata->mp, cp, &mmdata->mptmp));
7373   for (i = 0; i < cp; i++) {
7374     mmdata->mp[i]    = mp[i];
7375     mmdata->mptmp[i] = mptmp[i];
7376   }
7377   mmdata->cp             = cp;
7378   C->product->data       = mmdata;
7379   C->product->destroy    = MatProductCtxDestroy_MatMatMPIAIJBACKEND;
7380   C->ops->productnumeric = MatProductNumeric_MPIAIJBACKEND;
7381 
7382   /* memory type */
7383   mmdata->mtype = PETSC_MEMTYPE_HOST;
7384   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iscuda, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, ""));
7385   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iship, MATSEQAIJHIPSPARSE, MATMPIAIJHIPSPARSE, ""));
7386   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iskokk, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, ""));
7387   if (iscuda) mmdata->mtype = PETSC_MEMTYPE_CUDA;
7388   else if (iship) mmdata->mtype = PETSC_MEMTYPE_HIP;
7389   else if (iskokk) mmdata->mtype = PETSC_MEMTYPE_KOKKOS;
7390 
7391   /* prepare coo coordinates for values insertion */
7392 
7393   /* count the total number of nonzeros of the intermediate seqaij Mats
7394     ncoo_d:    # of nonzeros of matrices that do not have offproc entries
7395     ncoo_o:    # of nonzeros (of matrices that might have offproc entries) that will be inserted on remote procs
7396     ncoo_oown: # of nonzeros (of matrices that might have offproc entries) that will be inserted locally
7397   */
7398   for (cp = 0, ncoo_d = 0, ncoo_o = 0, ncoo_oown = 0; cp < mmdata->cp; cp++) {
7399     Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mp[cp]->data;
7400     if (mptmp[cp]) continue;
7401     if (rmapt[cp] == 2 && hasoffproc) { /* the rows need to be scattered to all processes (might include self) */
7402       const PetscInt *rmap = rmapa[cp];
7403       const PetscInt  mr   = mp[cp]->rmap->n;
7404       const PetscInt  rs   = C->rmap->rstart;
7405       const PetscInt  re   = C->rmap->rend;
7406       const PetscInt *ii   = mm->i;
7407       for (i = 0; i < mr; i++) {
7408         const PetscInt gr = rmap[i];
7409         const PetscInt nz = ii[i + 1] - ii[i];
7410         if (gr < rs || gr >= re) ncoo_o += nz; /* this row is offproc */
7411         else ncoo_oown += nz;                  /* this row is local */
7412       }
7413     } else ncoo_d += mm->nz;
7414   }
7415 
7416   /*
7417     ncoo: total number of nonzeros (including those inserted by remote procs) belonging to this proc
7418 
7419     ncoo = ncoo_d + ncoo_oown + ncoo2, where ncoo2 is the number of nonzeros inserted on this process by other processes.
7420 
7421     off[0] points to a big index array, which is shared by off[1,2,...]. Similarly for own[0].
7422 
7423     off[p]: points to the segment for matrix mp[p], storing the locations of nonzeros that mp[p] will insert on other processes
7424     own[p]: points to the segment for matrix mp[p], storing the locations of nonzeros that mp[p] will insert locally
7425     so off[p+1]-off[p] is the number of nonzeros that mp[p] will send to other processes.
7426 
7427     coo_i/j/v[]: [ncoo] row/col/val of nonzeros belonging to this proc.
7428     Ex. coo_i[]: the beginning part (of size ncoo_d + ncoo_oown) stores the row indices of locally inserted nonzeros, and the remaining part stores the row indices of nonzeros this process will receive.
7429   */
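  /*
    Worked example (made-up numbers, for illustration only): suppose there are two non-temporary
    (mptmp[] == PETSC_FALSE) products, mp[0] with a contiguous row map (rmapt[0] == 1) contributing
    5 nonzeros, and mp[1] with a sparse row map (rmapt[1] == 2) having 3 nonzeros in rows owned by
    other processes and 4 in locally owned rows, while 2 nonzeros arrive from other processes. Then

      ncoo_d = 5, ncoo_o = 3, ncoo_oown = 4, ncoo2 = 2, ncoo = ncoo_d + ncoo_oown + ncoo2 = 11

    so coo_i[0..8]/coo_j[0..8] hold the locally inserted entries, coo_i[9..10]/coo_j[9..10] the
    received ones, and off[1]-off[0] = 0, off[2]-off[1] = 3, own[2]-own[1] = 4.
  */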
7430   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->off)); /* +1 to make a csr-like data structure */
7431   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->own));
7432 
7433   /* gather (i,j) of nonzeros inserted by remote procs */
7434   if (hasoffproc) {
7435     PetscSF  msf;
7436     PetscInt ncoo2, *coo_i2, *coo_j2;
7437 
7438     PetscCall(PetscMalloc1(ncoo_o, &mmdata->off[0]));
7439     PetscCall(PetscMalloc1(ncoo_oown, &mmdata->own[0]));
7440     PetscCall(PetscMalloc2(ncoo_o, &coo_i, ncoo_o, &coo_j)); /* to collect (i,j) of entries to be sent to others */
7441 
7442     for (cp = 0, ncoo_o = 0; cp < mmdata->cp; cp++) {
7443       Mat_SeqAIJ *mm     = (Mat_SeqAIJ *)mp[cp]->data;
7444       PetscInt   *idxoff = mmdata->off[cp];
7445       PetscInt   *idxown = mmdata->own[cp];
7446       if (!mptmp[cp] && rmapt[cp] == 2) { /* row map is sparse */
7447         const PetscInt *rmap = rmapa[cp];
7448         const PetscInt *cmap = cmapa[cp];
7449         const PetscInt *ii   = mm->i;
7450         PetscInt       *coi  = coo_i + ncoo_o;
7451         PetscInt       *coj  = coo_j + ncoo_o;
7452         const PetscInt  mr   = mp[cp]->rmap->n;
7453         const PetscInt  rs   = C->rmap->rstart;
7454         const PetscInt  re   = C->rmap->rend;
7455         const PetscInt  cs   = C->cmap->rstart;
7456         for (i = 0; i < mr; i++) {
7457           const PetscInt *jj = mm->j + ii[i];
7458           const PetscInt  gr = rmap[i];
7459           const PetscInt  nz = ii[i + 1] - ii[i];
7460           if (gr < rs || gr >= re) { /* this is an offproc row */
7461             for (j = ii[i]; j < ii[i + 1]; j++) {
7462               *coi++    = gr;
7463               *idxoff++ = j;
7464             }
7465             if (!cmapt[cp]) { /* already global */
7466               for (j = 0; j < nz; j++) *coj++ = jj[j];
7467             } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7468               for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7469             } else { /* type-2, local to global for sparse columns */
7470               for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7471             }
7472             ncoo_o += nz;
7473           } else { /* this is a local row */
7474             for (j = ii[i]; j < ii[i + 1]; j++) *idxown++ = j;
7475           }
7476         }
7477       }
7478       mmdata->off[cp + 1] = idxoff;
7479       mmdata->own[cp + 1] = idxown;
7480     }
7481 
7482     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7483     PetscInt incoo_o;
7484     PetscCall(PetscIntCast(ncoo_o, &incoo_o));
7485     PetscCall(PetscSFSetGraphLayout(mmdata->sf, C->rmap, incoo_o /*nleaves*/, NULL /*ilocal*/, PETSC_OWN_POINTER, coo_i));
7486     PetscCall(PetscSFGetMultiSF(mmdata->sf, &msf));
7487     PetscCall(PetscSFGetGraph(msf, &ncoo2 /*nroots*/, NULL, NULL, NULL));
7488     ncoo = ncoo_d + ncoo_oown + ncoo2;
7489     PetscCall(PetscMalloc2(ncoo, &coo_i2, ncoo, &coo_j2));
7490     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown)); /* put (i,j) of remote nonzeros at back */
7491     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown));
7492     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7493     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7494     PetscCall(PetscFree2(coo_i, coo_j));
7495     /* allocate MPI send buffer to collect nonzero values to be sent to remote procs */
7496     PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo_o * sizeof(PetscScalar), (void **)&mmdata->coo_w));
7497     coo_i = coo_i2;
7498     coo_j = coo_j2;
7499   } else { /* no offproc values insertion */
7500     ncoo = ncoo_d;
7501     PetscCall(PetscMalloc2(ncoo, &coo_i, ncoo, &coo_j));
7502 
7503     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7504     PetscCall(PetscSFSetGraph(mmdata->sf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
7505     PetscCall(PetscSFSetUp(mmdata->sf));
7506   }
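  /* Note: an SF is created even in this no-offproc branch so that the PetscSFMalloc() calls below (and,
     presumably, the numeric phase) can treat the with- and without-offproc cases uniformly */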
7507   mmdata->hasoffproc = hasoffproc;
7508 
7509   /* gather (i,j) of nonzeros inserted locally */
7510   for (cp = 0, ncoo_d = 0; cp < mmdata->cp; cp++) {
7511     Mat_SeqAIJ     *mm   = (Mat_SeqAIJ *)mp[cp]->data;
7512     PetscInt       *coi  = coo_i + ncoo_d;
7513     PetscInt       *coj  = coo_j + ncoo_d;
7514     const PetscInt *jj   = mm->j;
7515     const PetscInt *ii   = mm->i;
7516     const PetscInt *cmap = cmapa[cp];
7517     const PetscInt *rmap = rmapa[cp];
7518     const PetscInt  mr   = mp[cp]->rmap->n;
7519     const PetscInt  rs   = C->rmap->rstart;
7520     const PetscInt  re   = C->rmap->rend;
7521     const PetscInt  cs   = C->cmap->rstart;
7522 
7523     if (mptmp[cp]) continue;
7524     if (rmapt[cp] == 1) { /* consecutive rows */
7525       /* fill coo_i */
7526       for (i = 0; i < mr; i++) {
7527         const PetscInt gr = i + rs;
7528         for (j = ii[i]; j < ii[i + 1]; j++) coi[j] = gr;
7529       }
7530       /* fill coo_j */
7531       if (!cmapt[cp]) { /* type-0, already global */
7532         PetscCall(PetscArraycpy(coj, jj, mm->nz));
7533       } else if (cmapt[cp] == 1) {                        /* type-1, local to global for consecutive columns of C */
7534         for (j = 0; j < mm->nz; j++) coj[j] = jj[j] + cs; /* lid + col start */
7535       } else {                                            /* type-2, local to global for sparse columns */
7536         for (j = 0; j < mm->nz; j++) coj[j] = cmap[jj[j]];
7537       }
7538       ncoo_d += mm->nz;
7539     } else if (rmapt[cp] == 2) { /* sparse rows */
7540       for (i = 0; i < mr; i++) {
7541         const PetscInt *jj = mm->j + ii[i];
7542         const PetscInt  gr = rmap[i];
7543         const PetscInt  nz = ii[i + 1] - ii[i];
7544         if (gr >= rs && gr < re) { /* local rows */
7545           for (j = ii[i]; j < ii[i + 1]; j++) *coi++ = gr;
7546           if (!cmapt[cp]) { /* type-0, already global */
7547             for (j = 0; j < nz; j++) *coj++ = jj[j];
7548           } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7549             for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7550           } else { /* type-2, local to global for sparse columns */
7551             for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7552           }
7553           ncoo_d += nz;
7554         }
7555       }
7556     }
7557   }
7558   if (glob) PetscCall(ISRestoreIndices(glob, &globidx));
7559   PetscCall(ISDestroy(&glob));
7560   if (P_oth_l2g) PetscCall(ISLocalToGlobalMappingRestoreIndices(P_oth_l2g, &P_oth_idx));
7561   PetscCall(ISLocalToGlobalMappingDestroy(&P_oth_l2g));
7562   /* allocate an array to store all nonzeros (inserted locally or remotely) belonging to this proc */
7563   PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo * sizeof(PetscScalar), (void **)&mmdata->coo_v));
7564 
7565   /* set block sizes */
7566   A = product->A;
7567   P = product->B;
7568   switch (ptype) {
7569   case MATPRODUCT_PtAP:
7570     PetscCall(MatSetBlockSizes(C, P->cmap->bs, P->cmap->bs));
7571     break;
7572   case MATPRODUCT_RARt:
7573     PetscCall(MatSetBlockSizes(C, P->rmap->bs, P->rmap->bs));
7574     break;
7575   case MATPRODUCT_ABC:
7576     PetscCall(MatSetBlockSizesFromMats(C, A, product->C));
7577     break;
7578   case MATPRODUCT_AB:
7579     PetscCall(MatSetBlockSizesFromMats(C, A, P));
7580     break;
7581   case MATPRODUCT_AtB:
7582     PetscCall(MatSetBlockSizes(C, A->cmap->bs, P->cmap->bs));
7583     break;
7584   case MATPRODUCT_ABt:
7585     PetscCall(MatSetBlockSizes(C, A->rmap->bs, P->rmap->bs));
7586     break;
7587   default:
7588     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for ProductType %s", MatProductTypes[ptype]);
7589   }
7590 
7591   /* preallocate with COO data */
7592   PetscCall(MatSetPreallocationCOO(C, ncoo, coo_i, coo_j));
7593   PetscCall(PetscFree2(coo_i, coo_j));
7594   PetscFunctionReturn(PETSC_SUCCESS);
7595 }
7596 
7597 PetscErrorCode MatProductSetFromOptions_MPIAIJBACKEND(Mat mat)
7598 {
7599   Mat_Product *product = mat->product;
7600 #if defined(PETSC_HAVE_DEVICE)
7601   PetscBool match  = PETSC_FALSE;
7602   PetscBool usecpu = PETSC_FALSE;
7603 #else
7604   PetscBool match = PETSC_TRUE;
7605 #endif
7606 
7607   PetscFunctionBegin;
7608   MatCheckProduct(mat, 1);
7609 #if defined(PETSC_HAVE_DEVICE)
7610   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match));
7611   if (match) { /* we can always fall back to the CPU if requested */
7612     switch (product->type) {
7613     case MATPRODUCT_AB:
7614       if (product->api_user) {
7615         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
7616         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7617         PetscOptionsEnd();
7618       } else {
7619         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
7620         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7621         PetscOptionsEnd();
7622       }
7623       break;
7624     case MATPRODUCT_AtB:
7625       if (product->api_user) {
7626         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
7627         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7628         PetscOptionsEnd();
7629       } else {
7630         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
7631         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7632         PetscOptionsEnd();
7633       }
7634       break;
7635     case MATPRODUCT_PtAP:
7636       if (product->api_user) {
7637         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
7638         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7639         PetscOptionsEnd();
7640       } else {
7641         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
7642         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7643         PetscOptionsEnd();
7644       }
7645       break;
7646     default:
7647       break;
7648     }
7649     match = (PetscBool)!usecpu;
7650   }
7651 #endif
7652   if (match) {
7653     switch (product->type) {
7654     case MATPRODUCT_AB:
7655     case MATPRODUCT_AtB:
7656     case MATPRODUCT_PtAP:
7657       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJBACKEND;
7658       break;
7659     default:
7660       break;
7661     }
7662   }
7663   /* fallback to MPIAIJ ops */
7664   if (!mat->ops->productsymbolic) PetscCall(MatProductSetFromOptions_MPIAIJ(mat));
7665   PetscFunctionReturn(PETSC_SUCCESS);
7666 }
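
/*
  Usage note (illustrative): for device matrices the options handled above allow falling back to the CPU
  implementation, e.g. -matmatmult_backend_cpu when C = A*B is requested through MatMatMult(), or
  -mat_product_algorithm_backend_cpu when the product is set up through the MatProduct API.
*/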
7667 
7668 /*
7669    Produces the set of block column indices of the matrix row, one for each block represented in the original row
7670 
7671    n - (output) the number of block indices in cc[]
7672    cc - (output) the block indices (must be large enough to contain the indices)
7673 */
7674 static inline PetscErrorCode MatCollapseRow(Mat Amat, PetscInt row, PetscInt bs, PetscInt *n, PetscInt *cc)
7675 {
7676   PetscInt        cnt = -1, nidx, j;
7677   const PetscInt *idx;
7678 
7679   PetscFunctionBegin;
7680   PetscCall(MatGetRow(Amat, row, &nidx, &idx, NULL));
7681   if (nidx) {
7682     cnt     = 0;
7683     cc[cnt] = idx[0] / bs;
7684     for (j = 1; j < nidx; j++) {
7685       if (cc[cnt] < idx[j] / bs) cc[++cnt] = idx[j] / bs;
7686     }
7687   }
7688   PetscCall(MatRestoreRow(Amat, row, &nidx, &idx, NULL));
7689   *n = cnt + 1;
7690   PetscFunctionReturn(PETSC_SUCCESS);
7691 }
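
/*
  Example (for illustration): with bs = 2 and a row whose column indices are {0, 1, 3, 4, 8},
  MatCollapseRow() returns n = 4 and cc[] = {0, 1, 2, 4}, since 0/2 = 1/2 = 0, 3/2 = 1, 4/2 = 2,
  and 8/2 = 4; the column indices returned by MatGetRow() are sorted, which is what the single
  pass above relies on.
*/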
7692 
7693 /*
7694     Produces the set of block column indices of the matrix block row, one for each block represented in the original set of rows
7695 
7696     ncollapsed - (output) the number of block indices
7697     collapsed - (output) the block indices (the workspaces w0, w1, and w2 must be large enough to contain the indices)
7698 */
7699 static inline PetscErrorCode MatCollapseRows(Mat Amat, PetscInt start, PetscInt bs, PetscInt *w0, PetscInt *w1, PetscInt *w2, PetscInt *ncollapsed, PetscInt **collapsed)
7700 {
7701   PetscInt i, nprev, *cprev = w0, ncur = 0, *ccur = w1, *merged = w2, *cprevtmp;
7702 
7703   PetscFunctionBegin;
7704   PetscCall(MatCollapseRow(Amat, start, bs, &nprev, cprev));
7705   for (i = start + 1; i < start + bs; i++) {
7706     PetscCall(MatCollapseRow(Amat, i, bs, &ncur, ccur));
7707     PetscCall(PetscMergeIntArray(nprev, cprev, ncur, ccur, &nprev, &merged));
7708     cprevtmp = cprev;
7709     cprev    = merged;
7710     merged   = cprevtmp;
7711   }
7712   *ncollapsed = nprev;
7713   if (collapsed) *collapsed = cprev;
7714   PetscFunctionReturn(PETSC_SUCCESS);
7715 }
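
/*
  Example (made-up indices): with bs = 2, if row `start` collapses to block columns {0, 2} and row
  `start + 1` collapses to {2, 3}, the routine returns ncollapsed = 3 and collapsed[] = {0, 2, 3}.
  Swapping the cprev and merged pointers after each PetscMergeIntArray() call lets the running union
  alternate between two of the workspaces without copying it on every block row.
*/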
7716 
7717 /*
7718  MatCreateGraph_Simple_AIJ - create a simple scalar matrix (graph) from a potentially blocked matrix
7719 
7720  Input Parameters:
7721  + Amat - matrix
7722  . symmetrize - make the result symmetric
7723  . scale - scale with diagonal
7724  . filter - drop graph entries below this tolerance (negative means no filtering)
7725  - index_size, index - optional row/column indices within each block used to compute each block's value (index_size == 0 means use all)
7726  Output Parameter:
7727  . a_Gmat - the output scalar graph (entries >= 0)
7728 */
7729 PETSC_INTERN PetscErrorCode MatCreateGraph_Simple_AIJ(Mat Amat, PetscBool symmetrize, PetscBool scale, PetscReal filter, PetscInt index_size, PetscInt index[], Mat *a_Gmat)
7730 {
7731   PetscInt  Istart, Iend, Ii, jj, kk, ncols, nloc, NN, MM, bs;
7732   MPI_Comm  comm;
7733   Mat       Gmat;
7734   PetscBool ismpiaij, isseqaij;
7735   Mat       a, b, c;
7736   MatType   jtype;
7737 
7738   PetscFunctionBegin;
7739   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
7740   PetscCall(MatGetOwnershipRange(Amat, &Istart, &Iend));
7741   PetscCall(MatGetSize(Amat, &MM, &NN));
7742   PetscCall(MatGetBlockSize(Amat, &bs));
7743   nloc = (Iend - Istart) / bs;
7744 
7745   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATSEQAIJ, &isseqaij));
7746   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATMPIAIJ, &ismpiaij));
7747   PetscCheck(isseqaij || ismpiaij, comm, PETSC_ERR_USER, "Require (MPI)AIJ matrix type");
7748 
7749   /* TODO GPU: these calls are potentially expensive if matrices are large and we want to use the GPU */
7750   /* One solution would be to provide a new API, MatAIJGetCollapsedAIJ, so that each class can supply a fast
7751      implementation */
7752   if (bs > 1) {
7753     PetscCall(MatGetType(Amat, &jtype));
7754     PetscCall(MatCreate(comm, &Gmat));
7755     PetscCall(MatSetType(Gmat, jtype));
7756     PetscCall(MatSetSizes(Gmat, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE));
7757     PetscCall(MatSetBlockSizes(Gmat, 1, 1));
7758     if (isseqaij || ((Mat_MPIAIJ *)Amat->data)->garray) {
7759       PetscInt  *d_nnz, *o_nnz;
7760       MatScalar *aa, val, *AA;
7761       PetscInt  *aj, *ai, *AJ, nc, nmax = 0;
7762 
7763       if (isseqaij) {
7764         a = Amat;
7765         b = NULL;
7766       } else {
7767         Mat_MPIAIJ *d = (Mat_MPIAIJ *)Amat->data;
7768         a             = d->A;
7769         b             = d->B;
7770       }
7771       PetscCall(PetscInfo(Amat, "New bs>1 Graph. nloc=%" PetscInt_FMT "\n", nloc));
7772       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7773       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
7774         PetscInt       *nnz = (c == a) ? d_nnz : o_nnz;
7775         const PetscInt *cols1, *cols2;
7776 
7777         for (PetscInt brow = 0, nc1, nc2, ok = 1; brow < nloc * bs; brow += bs) { // block rows
7778           PetscCall(MatGetRow(c, brow, &nc2, &cols2, NULL));
7779           nnz[brow / bs] = nc2 / bs;
7780           if (nc2 % bs) ok = 0;
7781           if (nnz[brow / bs] > nmax) nmax = nnz[brow / bs];
7782           for (PetscInt ii = 1; ii < bs; ii++) { // check for non-dense blocks
7783             PetscCall(MatGetRow(c, brow + ii, &nc1, &cols1, NULL));
7784             if (nc1 != nc2) ok = 0;
7785             else {
7786               for (PetscInt jj = 0; jj < nc1 && ok == 1; jj++) {
7787                 if (cols1[jj] != cols2[jj]) ok = 0;
7788                 if (cols1[jj] % bs != jj % bs) ok = 0;
7789               }
7790             }
7791             PetscCall(MatRestoreRow(c, brow + ii, &nc1, &cols1, NULL));
7792           }
7793           PetscCall(MatRestoreRow(c, brow, &nc2, &cols2, NULL));
7794           if (!ok) {
7795             PetscCall(PetscFree2(d_nnz, o_nnz));
7796             PetscCall(PetscInfo(Amat, "Found sparse blocks - revert to slow method\n"));
7797             goto old_bs;
7798           }
7799         }
7800       }
7801       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7802       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7803       PetscCall(PetscFree2(d_nnz, o_nnz));
7804       PetscCall(PetscMalloc2(nmax, &AA, nmax, &AJ));
7805       // diag
7806       for (PetscInt brow = 0, n, grow; brow < nloc * bs; brow += bs) { // block rows
7807         Mat_SeqAIJ *aseq = (Mat_SeqAIJ *)a->data;
7808 
7809         ai = aseq->i;
7810         n  = ai[brow + 1] - ai[brow];
7811         aj = aseq->j + ai[brow];
7812         for (PetscInt k = 0; k < n; k += bs) {   // block columns
7813           AJ[k / bs] = aj[k] / bs + Istart / bs; // diag starts at (Istart,Istart)
7814           val        = 0;
7815           if (index_size == 0) {
7816             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7817               aa = aseq->a + ai[brow + ii] + k;
7818               for (PetscInt jj = 0; jj < bs; jj++) {    // columns in block
7819                 val += PetscAbs(PetscRealPart(aa[jj])); // a sort of norm
7820               }
7821             }
7822           } else {                                            // use (index,index) value if provided
7823             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7824               PetscInt ii = index[iii];
7825               aa          = aseq->a + ai[brow + ii] + k;
7826               for (PetscInt jjj = 0; jjj < index_size; jjj++) { // columns in block
7827                 PetscInt jj = index[jjj];
7828                 val += PetscAbs(PetscRealPart(aa[jj]));
7829               }
7830             }
7831           }
7832           PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7833           AA[k / bs] = val;
7834         }
7835         grow = Istart / bs + brow / bs;
7836         PetscCall(MatSetValues(Gmat, 1, &grow, n / bs, AJ, AA, ADD_VALUES));
7837       }
7838       // off-diag
7839       if (ismpiaij) {
7840         Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)Amat->data;
7841         const PetscScalar *vals;
7842         const PetscInt    *cols, *garray = aij->garray;
7843 
7844         PetscCheck(garray, PETSC_COMM_SELF, PETSC_ERR_USER, "No garray ?");
7845         for (PetscInt brow = 0, grow; brow < nloc * bs; brow += bs) { // block rows
7846           PetscCall(MatGetRow(b, brow, &ncols, &cols, NULL));
7847           for (PetscInt k = 0, cidx = 0; k < ncols; k += bs, cidx++) {
7848             PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs >= nmax");
7849             AA[k / bs] = 0;
7850             AJ[cidx]   = garray[cols[k]] / bs;
7851           }
7852           nc = ncols / bs;
7853           PetscCall(MatRestoreRow(b, brow, &ncols, &cols, NULL));
7854           if (index_size == 0) {
7855             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7856               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7857               for (PetscInt k = 0; k < ncols; k += bs) {
7858                 for (PetscInt jj = 0; jj < bs; jj++) { // cols in block
7859                   PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7860                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7861                 }
7862               }
7863               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7864             }
7865           } else {                                            // use (index,index) value if provided
7866             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7867               PetscInt ii = index[iii];
7868               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7869               for (PetscInt k = 0; k < ncols; k += bs) {
7870                 for (PetscInt jjj = 0; jjj < index_size; jjj++) { // cols in block
7871                   PetscInt jj = index[jjj];
7872                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7873                 }
7874               }
7875               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7876             }
7877           }
7878           grow = Istart / bs + brow / bs;
7879           PetscCall(MatSetValues(Gmat, 1, &grow, nc, AJ, AA, ADD_VALUES));
7880         }
7881       }
7882       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7883       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7884       PetscCall(PetscFree2(AA, AJ));
7885     } else {
7886       const PetscScalar *vals;
7887       const PetscInt    *idx;
7888       PetscInt          *d_nnz, *o_nnz, *w0, *w1, *w2;
7889     old_bs:
7890       /*
7891        Determine the preallocation needed for the scalar matrix derived from the vector matrix.
7892        */
7893       PetscCall(PetscInfo(Amat, "OLD bs>1 CreateGraph\n"));
7894       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7895       if (isseqaij) {
7896         PetscInt max_d_nnz;
7897 
7898         /*
7899          Determine exact preallocation count for (sequential) scalar matrix
7900          */
7901         PetscCall(MatSeqAIJGetMaxRowNonzeros(Amat, &max_d_nnz));
7902         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7903         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7904         for (Ii = 0, jj = 0; Ii < Iend; Ii += bs, jj++) PetscCall(MatCollapseRows(Amat, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7905         PetscCall(PetscFree3(w0, w1, w2));
7906       } else if (ismpiaij) {
7907         Mat             Daij, Oaij;
7908         const PetscInt *garray;
7909         PetscInt        max_d_nnz;
7910 
7911         PetscCall(MatMPIAIJGetSeqAIJ(Amat, &Daij, &Oaij, &garray));
7912         /*
7913          Determine exact preallocation count for diagonal block portion of scalar matrix
7914          */
7915         PetscCall(MatSeqAIJGetMaxRowNonzeros(Daij, &max_d_nnz));
7916         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7917         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7918         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) PetscCall(MatCollapseRows(Daij, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7919         PetscCall(PetscFree3(w0, w1, w2));
7920         /*
7921          Overestimate (usually grossly so) the preallocation count for the off-diagonal portion of the scalar matrix
7922          */
7923         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) {
7924           o_nnz[jj] = 0;
7925           for (kk = 0; kk < bs; kk++) { /* rows that get collapsed to a single row */
7926             PetscCall(MatGetRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7927             o_nnz[jj] += ncols;
7928             PetscCall(MatRestoreRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7929           }
7930           if (o_nnz[jj] > (NN / bs - nloc)) o_nnz[jj] = NN / bs - nloc;
7931         }
7932       } else SETERRQ(comm, PETSC_ERR_USER, "Require AIJ matrix type");
7933       /* get scalar copy (norms) of matrix */
7934       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7935       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7936       PetscCall(PetscFree2(d_nnz, o_nnz));
7937       for (Ii = Istart; Ii < Iend; Ii++) {
7938         PetscInt dest_row = Ii / bs;
7939 
7940         PetscCall(MatGetRow(Amat, Ii, &ncols, &idx, &vals));
7941         for (jj = 0; jj < ncols; jj++) {
7942           PetscInt    dest_col = idx[jj] / bs;
7943           PetscScalar sv       = PetscAbs(PetscRealPart(vals[jj]));
7944 
7945           PetscCall(MatSetValues(Gmat, 1, &dest_row, 1, &dest_col, &sv, ADD_VALUES));
7946         }
7947         PetscCall(MatRestoreRow(Amat, Ii, &ncols, &idx, &vals));
7948       }
7949       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7950       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7951     }
7952   } else {
7953     if (symmetrize || filter >= 0 || scale) PetscCall(MatDuplicate(Amat, MAT_COPY_VALUES, &Gmat));
7954     else {
7955       Gmat = Amat;
7956       PetscCall(PetscObjectReference((PetscObject)Gmat));
7957     }
7958     if (isseqaij) {
7959       a = Gmat;
7960       b = NULL;
7961     } else {
7962       Mat_MPIAIJ *d = (Mat_MPIAIJ *)Gmat->data;
7963       a             = d->A;
7964       b             = d->B;
7965     }
7966     if (filter >= 0 || scale) {
7967       /* take absolute value of each entry */
7968       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
7969         MatInfo      info;
7970         PetscScalar *avals;
7971 
7972         PetscCall(MatGetInfo(c, MAT_LOCAL, &info));
7973         PetscCall(MatSeqAIJGetArray(c, &avals));
7974         for (int jj = 0; jj < info.nz_used; jj++) avals[jj] = PetscAbsScalar(avals[jj]);
7975         PetscCall(MatSeqAIJRestoreArray(c, &avals));
7976       }
7977     }
7978   }
7979   if (symmetrize) {
7980     PetscBool isset, issym;
7981 
7982     PetscCall(MatIsSymmetricKnown(Amat, &isset, &issym));
7983     if (!isset || !issym) {
7984       Mat matTrans;
7985 
7986       PetscCall(MatTranspose(Gmat, MAT_INITIAL_MATRIX, &matTrans));
7987       PetscCall(MatAXPY(Gmat, 1.0, matTrans, Gmat->structurally_symmetric == PETSC_BOOL3_TRUE ? SAME_NONZERO_PATTERN : DIFFERENT_NONZERO_PATTERN));
7988       PetscCall(MatDestroy(&matTrans));
7989     }
7990     PetscCall(MatSetOption(Gmat, MAT_SYMMETRIC, PETSC_TRUE));
7991   } else if (Amat != Gmat) PetscCall(MatPropagateSymmetryOptions(Amat, Gmat));
7992   if (scale) {
7993     /* symmetrically scale Gmat so that all diagonal values become +1 or -1 */
7994     Vec diag;
7995 
7996     PetscCall(MatCreateVecs(Gmat, &diag, NULL));
7997     PetscCall(MatGetDiagonal(Gmat, diag));
7998     PetscCall(VecReciprocal(diag));
7999     PetscCall(VecSqrtAbs(diag));
8000     PetscCall(MatDiagonalScale(Gmat, diag, diag));
8001     PetscCall(VecDestroy(&diag));
8002   }
8003   PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_graph_view"));
8004   if (filter >= 0) {
8005     PetscCall(MatFilter(Gmat, filter, PETSC_TRUE, PETSC_TRUE));
8006     PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_filter_graph_view"));
8007   }
8008   *a_Gmat = Gmat;
8009   PetscFunctionReturn(PETSC_SUCCESS);
8010 }
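
/*
  A minimal usage sketch (illustrative only; Amat is assumed to be an assembled (MPI)AIJ matrix,
  possibly with block size > 1):

    Mat G;
    PetscCall(MatCreateGraph_Simple_AIJ(Amat, PETSC_TRUE, PETSC_TRUE, 0.01, 0, NULL, &G));
    // G is now a scalar (bs = 1) graph with nonnegative entries, symmetrized, diagonally scaled,
    // and filtered with threshold 0.01
    PetscCall(MatDestroy(&G));
*/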
8011 
8012 PETSC_INTERN PetscErrorCode MatGetCurrentMemType_MPIAIJ(Mat A, PetscMemType *memtype)
8013 {
8014   Mat_MPIAIJ  *mpiaij = (Mat_MPIAIJ *)A->data;
8015   PetscMemType mD = PETSC_MEMTYPE_HOST, mO = PETSC_MEMTYPE_HOST;
8016 
8017   PetscFunctionBegin;
8018   if (mpiaij->A) PetscCall(MatGetCurrentMemType(mpiaij->A, &mD));
8019   if (mpiaij->B) PetscCall(MatGetCurrentMemType(mpiaij->B, &mO));
8020   *memtype = (mD == mO) ? mD : PETSC_MEMTYPE_HOST;
8021   PetscFunctionReturn(PETSC_SUCCESS);
8022 }
8023 
8024 /*
8025     Special version for direct calls from Fortran
8026 */
8027 
8028 /* Change these macros so they can be used in a void function */
8029 /* Identical to PetscCallVoid, except it assigns to *_ierr */
8030 #undef PetscCall
8031 #define PetscCall(...) \
8032   do { \
8033     PetscErrorCode ierr_msv_mpiaij = __VA_ARGS__; \
8034     if (PetscUnlikely(ierr_msv_mpiaij)) { \
8035       *_ierr = PetscError(PETSC_COMM_SELF, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr_msv_mpiaij, PETSC_ERROR_REPEAT, " "); \
8036       return; \
8037     } \
8038   } while (0)
8039 
8040 #undef SETERRQ
8041 #define SETERRQ(comm, ierr, ...) \
8042   do { \
8043     *_ierr = PetscError(comm, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr, PETSC_ERROR_INITIAL, __VA_ARGS__); \
8044     return; \
8045   } while (0)
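
/*
  With the redefinitions above, a failing call inside matsetvaluesmpiaij_() stores the error through
  PetscError() in *_ierr and simply returns from the void function, instead of returning a
  PetscErrorCode as the standard PetscCall()/SETERRQ() would.
*/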
8046 
8047 #if defined(PETSC_HAVE_FORTRAN_CAPS)
8048   #define matsetvaluesmpiaij_ MATSETVALUESMPIAIJ
8049 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
8050   #define matsetvaluesmpiaij_ matsetvaluesmpiaij
8051 #else
8052 #endif
8053 PETSC_EXTERN void matsetvaluesmpiaij_(Mat *mmat, PetscInt *mm, const PetscInt im[], PetscInt *mn, const PetscInt in[], const PetscScalar v[], InsertMode *maddv, PetscErrorCode *_ierr)
8054 {
8055   Mat         mat = *mmat;
8056   PetscInt    m = *mm, n = *mn;
8057   InsertMode  addv = *maddv;
8058   Mat_MPIAIJ *aij  = (Mat_MPIAIJ *)mat->data;
8059   PetscScalar value;
8060 
8061   MatCheckPreallocated(mat, 1);
8062   if (mat->insertmode == NOT_SET_VALUES) mat->insertmode = addv;
8063   else PetscCheck(mat->insertmode == addv, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Cannot mix add values and insert values");
8064   {
8065     PetscInt  i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
8066     PetscInt  cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
8067     PetscBool roworiented = aij->roworiented;
8068 
8069     /* Some Variables required in the macro */
8070     Mat         A     = aij->A;
8071     Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
8072     PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
8073     MatScalar  *aa;
8074     PetscBool   ignorezeroentries = ((a->ignorezeroentries && (addv == ADD_VALUES)) ? PETSC_TRUE : PETSC_FALSE);
8075     Mat         B                 = aij->B;
8076     Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
8077     PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
8078     MatScalar  *ba;
8079     /* The variable below is only for the PETSC_HAVE_VIENNACL or PETSC_HAVE_CUDA cases, but we define it in all cases because we
8080      * cannot use "#if defined" inside a macro. */
8081     PETSC_UNUSED PetscBool inserted = PETSC_FALSE;
8082 
8083     PetscInt  *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
8084     PetscInt   nonew = a->nonew;
8085     MatScalar *ap1, *ap2;
8086 
8087     PetscFunctionBegin;
8088     PetscCall(MatSeqAIJGetArray(A, &aa));
8089     PetscCall(MatSeqAIJGetArray(B, &ba));
8090     for (i = 0; i < m; i++) {
8091       if (im[i] < 0) continue;
8092       PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
8093       if (im[i] >= rstart && im[i] < rend) {
8094         row      = im[i] - rstart;
8095         lastcol1 = -1;
8096         rp1      = aj + ai[row];
8097         ap1      = aa + ai[row];
8098         rmax1    = aimax[row];
8099         nrow1    = ailen[row];
8100         low1     = 0;
8101         high1    = nrow1;
8102         lastcol2 = -1;
8103         rp2      = bj + bi[row];
8104         ap2      = ba + bi[row];
8105         rmax2    = bimax[row];
8106         nrow2    = bilen[row];
8107         low2     = 0;
8108         high2    = nrow2;
8109 
8110         for (j = 0; j < n; j++) {
8111           if (roworiented) value = v[i * n + j];
8112           else value = v[i + j * m];
8113           if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
8114           if (in[j] >= cstart && in[j] < cend) {
8115             col = in[j] - cstart;
8116             MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
8117           } else if (in[j] < 0) continue;
8118           else if (PetscUnlikelyDebug(in[j] >= mat->cmap->N)) {
8119             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
8120           } else {
8121             if (mat->was_assembled) {
8122               if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
8123 #if defined(PETSC_USE_CTABLE)
8124               PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col));
8125               col--;
8126 #else
8127               col = aij->colmap[in[j]] - 1;
8128 #endif
8129               if (col < 0 && !((Mat_SeqAIJ *)aij->A->data)->nonew) {
8130                 PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
8131                 col = in[j];
8132                 /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
8133                 B        = aij->B;
8134                 b        = (Mat_SeqAIJ *)B->data;
8135                 bimax    = b->imax;
8136                 bi       = b->i;
8137                 bilen    = b->ilen;
8138                 bj       = b->j;
8139                 rp2      = bj + bi[row];
8140                 ap2      = ba + bi[row];
8141                 rmax2    = bimax[row];
8142                 nrow2    = bilen[row];
8143                 low2     = 0;
8144                 high2    = nrow2;
8145                 bm       = aij->B->rmap->n;
8146                 ba       = b->a;
8147                 inserted = PETSC_FALSE;
8148               }
8149             } else col = in[j];
8150             MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
8151           }
8152         }
8153       } else if (!aij->donotstash) {
8154         if (roworiented) {
8155           PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, v + i * n, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8156         } else {
8157           PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, v + i, m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8158         }
8159       }
8160     }
8161     PetscCall(MatSeqAIJRestoreArray(A, &aa));
8162     PetscCall(MatSeqAIJRestoreArray(B, &ba));
8163   }
8164   PetscFunctionReturnVoid();
8165 }
8166 
8167 /* Undefining these here since they were redefined from their original definition above! No
8168  * other PETSc functions should be defined past this point, as it is impossible to recover the
8169  * original definitions */
8170 #undef PetscCall
8171 #undef SETERRQ
8172