#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <petsc/private/vecimpl.h>
#include <petsc/private/sfimpl.h>
#include <petsc/private/isimpl.h>
#include <petscblaslapack.h>
#include <petscsf.h>
#include <petsc/private/hashmapi.h>

/* defines MatSetValues_MPI_Hash(), MatAssemblyBegin_MPI_Hash(), and MatAssemblyEnd_MPI_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ

static PetscErrorCode MatReset_MPIAIJ(Mat mat)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;

  PetscFunctionBegin;
  PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
  PetscCall(MatStashDestroy_Private(&mat->stash));
  PetscCall(VecDestroy(&aij->diag));
  PetscCall(MatDestroy(&aij->A));
  PetscCall(MatDestroy(&aij->B));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&aij->colmap));
#else
  PetscCall(PetscFree(aij->colmap));
#endif
  PetscCall(PetscFree(aij->garray));
  PetscCall(VecDestroy(&aij->lvec));
  PetscCall(VecScatterDestroy(&aij->Mvctx));
  PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
  PetscCall(PetscFree(aij->ld));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatResetHash_MPIAIJ(Mat mat)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
  /* Save the nonzero states of the component matrices because they are what determine
     the nonzero state of mat */
  PetscObjectState Astate = aij->A->nonzerostate, Bstate = aij->B->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatReset_MPIAIJ(mat));
  PetscCall(MatSetUp_MPI_Hash(mat));
  aij->A->nonzerostate = ++Astate, aij->B->nonzerostate = ++Bstate;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJ(Mat mat)
{
  PetscFunctionBegin;
  PetscCall(MatReset_MPIAIJ(mat));

  PetscCall(PetscFree(mat->data));

  /* may be created by MatCreateMPIAIJSumSeqAIJSymbolic */
  PetscCall(PetscObjectCompose((PetscObject)mat, "MatMergeSeqsToMPI", NULL));

  PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatStoreValues_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatRetrieveValues_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatIsTranspose_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetHash_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocationCSR_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDiagonalScaleLocal_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpibaij_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisbaij_C", NULL));
#if defined(PETSC_HAVE_CUDA)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcusparse_C", NULL));
#endif
#if defined(PETSC_HAVE_HIP)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijhipsparse_C", NULL));
#endif
#if defined(PETSC_HAVE_KOKKOS_KERNELS)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijkokkos_C", NULL));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", NULL));
#if defined(PETSC_HAVE_ELEMENTAL)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_elemental_C", NULL));
#endif
#if defined(PETSC_HAVE_SCALAPACK)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_scalapack_C", NULL));
#endif
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_hypre_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", NULL));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_is_mpiaij_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpiaij_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetUseScalableIncreaseOverlap_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijperm_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijsell_C", NULL));
#if defined(PETSC_HAVE_MKL_SPARSE)
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijmkl_C", NULL));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcrl_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisell_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetValuesCOO_C", NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &B));
  PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject)B));
  PetscCall(MatGetRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
  PetscCall(MatDestroy(&B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatRestoreRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject *)&B));
  PetscCall(MatRestoreRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
  PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}
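
/*
  Illustrative usage sketch (not part of the PETSc sources; A is a placeholder matrix): the
  two routines above are paired, so every MatGetRowIJ() must be matched by a MatRestoreRowIJ()
  with the same arguments, and ia/ja may only be read in between, and only when done is PETSC_TRUE.

    const PetscInt *ia, *ja;
    PetscInt        nrows;
    PetscBool       done;

    PetscCall(MatGetRowIJ(A, 0, PETSC_FALSE, PETSC_FALSE, &nrows, &ia, &ja, &done));
    if (done) PetscCall(PetscPrintf(PETSC_COMM_SELF, "row 0 has %" PetscInt_FMT " entries\n", ia[1] - ia[0]));
    PetscCall(MatRestoreRowIJ(A, 0, PETSC_FALSE, PETSC_FALSE, &nrows, &ia, &ja, &done));
*/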

/*MC
   MATAIJ - MATAIJ = "aij" - A matrix type to be used for sparse matrices.

   This matrix type is identical to `MATSEQAIJ` when constructed with a single process communicator,
   and `MATMPIAIJ` otherwise.  As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

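   Example Usage:
   A minimal sketch (illustrative only; the sizes and communicator are placeholders) of the
   recommendation above to call both preallocation routines, so the same code runs correctly
   on one or many processes:
.vb
   Mat A;

   PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
   PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100));
   PetscCall(MatSetType(A, MATAIJ));
   PetscCall(MatSeqAIJSetPreallocation(A, 10, NULL));
   PetscCall(MatMPIAIJSetPreallocation(A, 10, NULL, 5, NULL));
.ve
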
   Options Database Key:
. -mat_type aij - sets the matrix type to `MATAIJ` during a call to `MatSetFromOptions()`

   Level: beginner

   Developer Note:
   Subclasses include `MATAIJCUSPARSE`, `MATAIJPERM`, `MATAIJSELL`, `MATAIJMKL`, `MATAIJCRL`, and `MATAIJKOKKOS`; the type also automatically
   switches over to use inodes when enough of them exist.

.seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateAIJ()`, `MatCreateSeqAIJ()`
M*/

/*MC
   MATAIJCRL - MATAIJCRL = "aijcrl" - A matrix type to be used for sparse matrices.

   This matrix type is identical to `MATSEQAIJCRL` when constructed with a single process communicator,
   and `MATMPIAIJCRL` otherwise.  As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

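   Example Usage:
   An illustrative sketch (sizes and communicator are placeholders): select the format from
   the options database instead of hard-coding it.
.vb
   Mat A;

   PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
   PetscCall(MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100));
   PetscCall(MatSetFromOptions(A)); /* then run with -mat_type aijcrl */
.ve
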
   Options Database Key:
. -mat_type aijcrl - sets the matrix type to `MATMPIAIJCRL` during a call to `MatSetFromOptions()`

   Level: beginner

.seealso: [](ch_matrices), `Mat`, `MatCreateMPIAIJCRL()`, `MATSEQAIJCRL`, `MATMPIAIJCRL`
M*/

static PetscErrorCode MatBindToCPU_MPIAIJ(Mat A, PetscBool flg)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_VIENNACL)
  A->boundtocpu = flg;
#endif
  if (a->A) PetscCall(MatBindToCPU(a->A, flg));
  if (a->B) PetscCall(MatBindToCPU(a->B, flg));

  /* In addition to binding the diagonal and off-diagonal matrices, bind the local vectors used for matrix-vector products.
   * This may seem a little odd for a MatBindToCPU() call to do, but it makes no sense for the binding of these vectors
   * to differ from the parent matrix. */
  if (a->lvec) PetscCall(VecBindToCPU(a->lvec, flg));
  if (a->diag) PetscCall(VecBindToCPU(a->diag, flg));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetBlockSizes_MPIAIJ(Mat M, PetscInt rbs, PetscInt cbs)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ *)M->data;

  PetscFunctionBegin;
  if (mat->A) {
    PetscCall(MatSetBlockSizes(mat->A, rbs, cbs));
    PetscCall(MatSetBlockSizes(mat->B, rbs, 1));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatFindNonzeroRows_MPIAIJ(Mat M, IS *keptrows)
{
  Mat_MPIAIJ      *mat = (Mat_MPIAIJ *)M->data;
  Mat_SeqAIJ      *a   = (Mat_SeqAIJ *)mat->A->data;
  Mat_SeqAIJ      *b   = (Mat_SeqAIJ *)mat->B->data;
  const PetscInt  *ia, *ib;
  const MatScalar *aa, *bb, *aav, *bav;
  PetscInt         na, nb, i, j, *rows, cnt = 0, n0rows;
  PetscInt         m = M->rmap->n, rstart = M->rmap->rstart;

  PetscFunctionBegin;
  *keptrows = NULL;

  ia = a->i;
  ib = b->i;
  PetscCall(MatSeqAIJGetArrayRead(mat->A, &aav));
  PetscCall(MatSeqAIJGetArrayRead(mat->B, &bav));
  for (i = 0; i < m; i++) {
    na = ia[i + 1] - ia[i];
    nb = ib[i + 1] - ib[i];
    if (!na && !nb) {
      cnt++;
      goto ok1;
    }
    aa = aav + ia[i];
    for (j = 0; j < na; j++) {
      if (aa[j] != 0.0) goto ok1;
    }
    bb = PetscSafePointerPlusOffset(bav, ib[i]);
    for (j = 0; j < nb; j++) {
      if (bb[j] != 0.0) goto ok1;
    }
    cnt++;
  ok1:;
  }
  PetscCallMPI(MPIU_Allreduce(&cnt, &n0rows, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)M)));
  if (!n0rows) {
    PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
    PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCall(PetscMalloc1(M->rmap->n - cnt, &rows));
  cnt = 0;
  for (i = 0; i < m; i++) {
    na = ia[i + 1] - ia[i];
    nb = ib[i + 1] - ib[i];
    if (!na && !nb) continue;
    aa = aav + ia[i];
    for (j = 0; j < na; j++) {
      if (aa[j] != 0.0) {
        rows[cnt++] = rstart + i;
        goto ok2;
      }
    }
    bb = PetscSafePointerPlusOffset(bav, ib[i]);
    for (j = 0; j < nb; j++) {
      if (bb[j] != 0.0) {
        rows[cnt++] = rstart + i;
        goto ok2;
      }
    }
  ok2:;
  }
  PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), cnt, rows, PETSC_OWN_POINTER, keptrows));
  PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
  PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDiagonalSet_MPIAIJ(Mat Y, Vec D, InsertMode is)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)Y->data;
  PetscBool   cong;

  PetscFunctionBegin;
  PetscCall(MatHasCongruentLayouts(Y, &cong));
  if (Y->assembled && cong) {
    PetscCall(MatDiagonalSet(aij->A, D, is));
  } else {
    PetscCall(MatDiagonalSet_Default(Y, D, is));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatFindZeroDiagonals_MPIAIJ(Mat M, IS *zrows)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)M->data;
  PetscInt    i, rstart, nrows, *rows;

  PetscFunctionBegin;
  *zrows = NULL;
  PetscCall(MatFindZeroDiagonals_SeqAIJ_Private(aij->A, &nrows, &rows));
  PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
  for (i = 0; i < nrows; i++) rows[i] += rstart;
  PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), nrows, rows, PETSC_OWN_POINTER, zrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetColumnReductions_MPIAIJ(Mat A, PetscInt type, PetscReal *reductions)
{
  Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)A->data;
  PetscInt           i, m, n, *garray = aij->garray;
  Mat_SeqAIJ        *a_aij = (Mat_SeqAIJ *)aij->A->data;
  Mat_SeqAIJ        *b_aij = (Mat_SeqAIJ *)aij->B->data;
  PetscReal         *work;
  const PetscScalar *dummy;
  PetscMPIInt        in;

  PetscFunctionBegin;
  PetscCall(MatGetSize(A, &m, &n));
  PetscCall(PetscCalloc1(n, &work));
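  /* get/restore the value arrays once up front so that, for GPU matrix types, the host copies
     accessed directly through a_aij->a and b_aij->a below are up to date */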
  PetscCall(MatSeqAIJGetArrayRead(aij->A, &dummy));
  PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &dummy));
  PetscCall(MatSeqAIJGetArrayRead(aij->B, &dummy));
  PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &dummy));
  if (type == NORM_2) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i] * a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i] * b_aij->a[i]);
  } else if (type == NORM_1) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i]);
  } else if (type == NORM_INFINITY) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] = PetscMax(PetscAbsScalar(a_aij->a[i]), work[A->cmap->rstart + a_aij->j[i]]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] = PetscMax(PetscAbsScalar(b_aij->a[i]), work[garray[b_aij->j[i]]]);
  } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscRealPart(a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscRealPart(b_aij->a[i]);
  } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
    for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscImaginaryPart(a_aij->a[i]);
    for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscImaginaryPart(b_aij->a[i]);
  } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
  PetscCall(PetscMPIIntCast(n, &in));
  if (type == NORM_INFINITY) {
    PetscCallMPI(MPIU_Allreduce(work, reductions, in, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)A)));
  } else {
    PetscCallMPI(MPIU_Allreduce(work, reductions, in, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
  }
  PetscCall(PetscFree(work));
  if (type == NORM_2) {
    for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
  } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
    for (i = 0; i < n; i++) reductions[i] /= m;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatFindOffBlockDiagonalEntries_MPIAIJ(Mat A, IS *is)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  IS              sis, gis;
  const PetscInt *isis, *igis;
  PetscInt        n, *iis, nsis, ngis, rstart, i;

  PetscFunctionBegin;
  PetscCall(MatFindOffBlockDiagonalEntries(a->A, &sis));
  PetscCall(MatFindNonzeroRows(a->B, &gis));
  PetscCall(ISGetSize(gis, &ngis));
  PetscCall(ISGetSize(sis, &nsis));
  PetscCall(ISGetIndices(sis, &isis));
  PetscCall(ISGetIndices(gis, &igis));

  PetscCall(PetscMalloc1(ngis + nsis, &iis));
  PetscCall(PetscArraycpy(iis, igis, ngis));
  PetscCall(PetscArraycpy(iis + ngis, isis, nsis));
  n = ngis + nsis;
  PetscCall(PetscSortRemoveDupsInt(&n, iis));
  PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
  for (i = 0; i < n; i++) iis[i] += rstart;
  PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)A), n, iis, PETSC_OWN_POINTER, is));

  PetscCall(ISRestoreIndices(sis, &isis));
  PetscCall(ISRestoreIndices(gis, &igis));
  PetscCall(ISDestroy(&sis));
  PetscCall(ISDestroy(&gis));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Local utility routine that creates a mapping from the global column
  number to the local number in the off-diagonal part of the local
  storage of the matrix.  When PETSC_USE_CTABLE is defined this is scalable,
  at a slightly higher hash-table lookup cost; without it, it is not scalable
  (each process stores an integer array of length equal to the global number
  of columns) but access is fast.
*/
PetscErrorCode MatCreateColmap_MPIAIJ_Private(Mat mat)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
  PetscInt    n   = aij->B->cmap->n, i;

  PetscFunctionBegin;
  PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapICreateWithSize(n, &aij->colmap));
  for (i = 0; i < n; i++) PetscCall(PetscHMapISet(aij->colmap, aij->garray[i] + 1, i + 1));
#else
  PetscCall(PetscCalloc1(mat->cmap->N + 1, &aij->colmap));
  for (i = 0; i < n; i++) aij->colmap[aij->garray[i]] = i + 1;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
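
/*
  Illustrative sketch (not part of this file; aij and gcol are placeholders) of how the colmap
  built above is consumed: translate a global column id into a local column id of the
  off-diagonal block B.  The +1 shift applied at insertion makes 0 mean "not found", so after
  shifting back a result of -1 means gcol is not a column of B on this rank.

    PetscInt lcol;
  #if defined(PETSC_USE_CTABLE)
    PetscCall(PetscHMapIGetWithDefault(aij->colmap, gcol + 1, 0, &lcol));
    lcol--;
  #else
    lcol = aij->colmap[gcol] - 1;
  #endif
*/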

#define MatSetValues_SeqAIJ_A_Private(row, col, value, addv, orow, ocol) \
  do { \
    if (col <= lastcol1) low1 = 0; \
    else high1 = nrow1; \
    lastcol1 = col; \
    while (high1 - low1 > 5) { \
      t = (low1 + high1) / 2; \
      if (rp1[t] > col) high1 = t; \
      else low1 = t; \
    } \
    for (_i = low1; _i < high1; _i++) { \
      if (rp1[_i] > col) break; \
      if (rp1[_i] == col) { \
        if (addv == ADD_VALUES) { \
          ap1[_i] += value; \
          /* Not sure whether LogFlops will slow down the code or not */ \
          (void)PetscLogFlops(1.0); \
        } else ap1[_i] = value; \
        goto a_noinsert; \
      } \
    } \
    if (value == 0.0 && ignorezeroentries && row != col) { \
      low1  = 0; \
      high1 = nrow1; \
      goto a_noinsert; \
    } \
    if (nonew == 1) { \
      low1  = 0; \
      high1 = nrow1; \
      goto a_noinsert; \
    } \
    PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
    MatSeqXAIJReallocateAIJ(A, am, 1, nrow1, row, col, rmax1, aa, ai, aj, rp1, ap1, aimax, nonew, MatScalar); \
    N = nrow1++ - 1; \
    a->nz++; \
    high1++; \
    /* shift up all the later entries in this row */ \
    PetscCall(PetscArraymove(rp1 + _i + 1, rp1 + _i, N - _i + 1)); \
    PetscCall(PetscArraymove(ap1 + _i + 1, ap1 + _i, N - _i + 1)); \
    rp1[_i] = col; \
    ap1[_i] = value; \
  a_noinsert:; \
    ailen[row] = nrow1; \
  } while (0)

#define MatSetValues_SeqAIJ_B_Private(row, col, value, addv, orow, ocol) \
  do { \
    if (col <= lastcol2) low2 = 0; \
    else high2 = nrow2; \
    lastcol2 = col; \
    while (high2 - low2 > 5) { \
      t = (low2 + high2) / 2; \
      if (rp2[t] > col) high2 = t; \
      else low2 = t; \
    } \
    for (_i = low2; _i < high2; _i++) { \
      if (rp2[_i] > col) break; \
      if (rp2[_i] == col) { \
        if (addv == ADD_VALUES) { \
          ap2[_i] += value; \
          (void)PetscLogFlops(1.0); \
        } else ap2[_i] = value; \
        goto b_noinsert; \
      } \
    } \
    if (value == 0.0 && ignorezeroentries) { \
      low2  = 0; \
      high2 = nrow2; \
      goto b_noinsert; \
    } \
    if (nonew == 1) { \
      low2  = 0; \
      high2 = nrow2; \
      goto b_noinsert; \
    } \
    PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
    MatSeqXAIJReallocateAIJ(B, bm, 1, nrow2, row, col, rmax2, ba, bi, bj, rp2, ap2, bimax, nonew, MatScalar); \
    N = nrow2++ - 1; \
    b->nz++; \
    high2++; \
    /* shift up all the later entries in this row */ \
    PetscCall(PetscArraymove(rp2 + _i + 1, rp2 + _i, N - _i + 1)); \
    PetscCall(PetscArraymove(ap2 + _i + 1, ap2 + _i, N - _i + 1)); \
    rp2[_i] = col; \
    ap2[_i] = value; \
  b_noinsert:; \
    bilen[row] = nrow2; \
  } while (0)

static PetscErrorCode MatSetValuesRow_MPIAIJ(Mat A, PetscInt row, const PetscScalar v[])
{
  Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ *)mat->A->data, *b = (Mat_SeqAIJ *)mat->B->data;
  PetscInt     l, *garray = mat->garray, diag;
  PetscScalar *aa, *ba;

  PetscFunctionBegin;
  /* code only works for square matrices A */
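  /* v[] must hold the complete row ordered by global column: the B (off-diagonal) entries to
     the left of the diagonal block, then the A (diagonal block) entries, then the remaining
     B entries; that is the order in which it is consumed below */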

  /* find size of row to the left of the diagonal part */
  PetscCall(MatGetOwnershipRange(A, &diag, NULL));
  row = row - diag;
  for (l = 0; l < b->i[row + 1] - b->i[row]; l++) {
    if (garray[b->j[b->i[row] + l]] > diag) break;
  }
  if (l) {
    PetscCall(MatSeqAIJGetArray(mat->B, &ba));
    PetscCall(PetscArraycpy(ba + b->i[row], v, l));
    PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
  }

  /* diagonal part */
  if (a->i[row + 1] - a->i[row]) {
    PetscCall(MatSeqAIJGetArray(mat->A, &aa));
    PetscCall(PetscArraycpy(aa + a->i[row], v + l, a->i[row + 1] - a->i[row]));
    PetscCall(MatSeqAIJRestoreArray(mat->A, &aa));
  }

  /* right of diagonal part */
  if (b->i[row + 1] - b->i[row] - l) {
    PetscCall(MatSeqAIJGetArray(mat->B, &ba));
    PetscCall(PetscArraycpy(ba + b->i[row] + l, v + l + a->i[row + 1] - a->i[row], b->i[row + 1] - b->i[row] - l));
    PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode addv)
{
  Mat_MPIAIJ *aij   = (Mat_MPIAIJ *)mat->data;
  PetscScalar value = 0.0;
  PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
  PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
  PetscBool   roworiented = aij->roworiented;

  /* Some variables required by the macros */
  Mat         A     = aij->A;
  Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
  PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
  PetscBool   ignorezeroentries = a->ignorezeroentries;
  Mat         B                 = aij->B;
  Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
  PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
  MatScalar  *aa, *ba;
  PetscInt   *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
  PetscInt    nonew;
  MatScalar  *ap1, *ap2;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJGetArray(A, &aa));
  PetscCall(MatSeqAIJGetArray(B, &ba));
  for (i = 0; i < m; i++) {
    if (im[i] < 0) continue;
    PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
    if (im[i] >= rstart && im[i] < rend) {
      row      = im[i] - rstart;
      lastcol1 = -1;
      rp1      = PetscSafePointerPlusOffset(aj, ai[row]);
      ap1      = PetscSafePointerPlusOffset(aa, ai[row]);
      rmax1    = aimax[row];
      nrow1    = ailen[row];
      low1     = 0;
      high1    = nrow1;
      lastcol2 = -1;
      rp2      = PetscSafePointerPlusOffset(bj, bi[row]);
      ap2      = PetscSafePointerPlusOffset(ba, bi[row]);
      rmax2    = bimax[row];
      nrow2    = bilen[row];
      low2     = 0;
      high2    = nrow2;

      for (j = 0; j < n; j++) {
        if (v) value = roworiented ? v[i * n + j] : v[i + j * m];
        if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
        if (in[j] >= cstart && in[j] < cend) {
          col   = in[j] - cstart;
          nonew = a->nonew;
          MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
        } else if (in[j] < 0) {
          continue;
        } else {
          PetscCheck(in[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
          if (mat->was_assembled) {
            if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col)); /* map global col ids to local ones */
            col--;
#else
            col = aij->colmap[in[j]] - 1;
#endif
            if (col < 0 && !((Mat_SeqAIJ *)aij->B->data)->nonew) { /* col < 0 means in[j] is a new col for B */
              PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));  /* Change aij->B from reduced/local format to expanded/global format */
              col = in[j];
              /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
              B     = aij->B;
              b     = (Mat_SeqAIJ *)B->data;
              bimax = b->imax;
              bi    = b->i;
              bilen = b->ilen;
              bj    = b->j;
              ba    = b->a;
              rp2   = PetscSafePointerPlusOffset(bj, bi[row]);
              ap2   = PetscSafePointerPlusOffset(ba, bi[row]);
              rmax2 = bimax[row];
              nrow2 = bilen[row];
              low2  = 0;
              high2 = nrow2;
              bm    = aij->B->rmap->n;
              ba    = b->a;
            } else if (col < 0 && !(ignorezeroentries && value == 0.0)) {
              if (1 == ((Mat_SeqAIJ *)aij->B->data)->nonew) {
                PetscCall(PetscInfo(mat, "Skipping of insertion of new nonzero location in off-diagonal portion of matrix %g(%" PetscInt_FMT ",%" PetscInt_FMT ")\n", (double)PetscRealPart(value), im[i], in[j]));
              } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", im[i], in[j]);
            }
          } else col = in[j];
          nonew = b->nonew;
          MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
        }
      }
    } else {
      PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Setting off process row %" PetscInt_FMT " even though MatSetOption(,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) was set", im[i]);
      if (!aij->donotstash) {
        mat->assembled = PETSC_FALSE;
        if (roworiented) {
          PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i * n), (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        } else {
          PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i), m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
        }
      }
    }
  }
  PetscCall(MatSeqAIJRestoreArray(A, &aa)); /* aa and ba may have been freed by the reallocation above, but we do not access them here */
  PetscCall(MatSeqAIJRestoreArray(B, &ba));
  PetscFunctionReturn(PETSC_SUCCESS);
}
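
/*
  Illustrative caller-side sketch (not part of this file; mat, grow, and gcol are
  placeholders): rows owned by other ranks are legal in MatSetValues(); such entries are
  stashed above and only communicated to their owners during assembly.

    PetscScalar val  = 1.0;
    PetscInt    grow = 0, gcol = 0;

    PetscCall(MatSetValues(mat, 1, &grow, 1, &gcol, &val, ADD_VALUES));
    PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
*/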

/*
    This function sets the j and ilen arrays (of the diagonal and off-diagonal parts) of an MPIAIJ matrix.
    The values in mat_i have to be sorted, and the values in mat_j have to be sorted within each row (CSR-like).
    No off-process parts of the matrix are allowed here, and mat->was_assembled has to be PETSC_FALSE.
*/
PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[])
{
  Mat_MPIAIJ *aij    = (Mat_MPIAIJ *)mat->data;
  Mat         A      = aij->A; /* diagonal part of the matrix */
  Mat         B      = aij->B; /* off-diagonal part of the matrix */
  Mat_SeqAIJ *a      = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJ *b      = (Mat_SeqAIJ *)B->data;
  PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, col;
  PetscInt   *ailen = a->ilen, *aj = a->j;
  PetscInt   *bilen = b->ilen, *bj = b->j;
  PetscInt    am          = aij->A->rmap->n, j;
  PetscInt    diag_so_far = 0, dnz;
  PetscInt    offd_so_far = 0, onz;

  PetscFunctionBegin;
  /* Iterate over all rows of the matrix */
  for (j = 0; j < am; j++) {
    dnz = onz = 0;
    /* Iterate over all non-zero columns of the current row */
    for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
      /* If the column is in the diagonal block */
      if (mat_j[col] >= cstart && mat_j[col] < cend) {
        aj[diag_so_far++] = mat_j[col] - cstart;
        dnz++;
      } else { /* off-diagonal entries */
        bj[offd_so_far++] = mat_j[col];
        onz++;
      }
    }
    ailen[j] = dnz;
    bilen[j] = onz;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    This function sets the local j, a and ilen arrays (of the diagonal and off-diagonal parts) of an MPIAIJ matrix.
    The values in mat_i have to be sorted, and the values in mat_j have to be sorted within each row (CSR-like).
    No off-process parts of the matrix are allowed here; they are set at a later point by MatSetValues_MPIAIJ().
    Also, mat->was_assembled has to be PETSC_FALSE, otherwise the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart;
    would not hold and the more general MatSetValues_MPIAIJ() has to be used.
*/
PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[], const PetscScalar mat_a[])
{
  Mat_MPIAIJ  *aij  = (Mat_MPIAIJ *)mat->data;
  Mat          A    = aij->A; /* diagonal part of the matrix */
  Mat          B    = aij->B; /* off-diagonal part of the matrix */
  Mat_SeqAIJ  *aijd = (Mat_SeqAIJ *)aij->A->data, *aijo = (Mat_SeqAIJ *)aij->B->data;
  Mat_SeqAIJ  *a      = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJ  *b      = (Mat_SeqAIJ *)B->data;
  PetscInt     cstart = mat->cmap->rstart, cend = mat->cmap->rend;
  PetscInt    *ailen = a->ilen, *aj = a->j;
  PetscInt    *bilen = b->ilen, *bj = b->j;
  PetscInt     am          = aij->A->rmap->n, j;
  PetscInt    *full_diag_i = aijd->i, *full_offd_i = aijo->i; /* These variables can also include non-local elements, which are set at a later point. */
  PetscInt     col, dnz_row, onz_row, rowstart_diag, rowstart_offd;
  PetscScalar *aa = a->a, *ba = b->a;

  PetscFunctionBegin;
  /* Iterate over all rows of the matrix */
  for (j = 0; j < am; j++) {
    dnz_row = onz_row = 0;
    rowstart_offd     = full_offd_i[j];
    rowstart_diag     = full_diag_i[j];
    /* Iterate over all non-zero columns of the current row */
    for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
      /* If the column is in the diagonal block */
      if (mat_j[col] >= cstart && mat_j[col] < cend) {
        aj[rowstart_diag + dnz_row] = mat_j[col] - cstart;
        aa[rowstart_diag + dnz_row] = mat_a[col];
        dnz_row++;
      } else { /* off-diagonal entries */
        bj[rowstart_offd + onz_row] = mat_j[col];
        ba[rowstart_offd + onz_row] = mat_a[col];
        onz_row++;
      }
    }
    ailen[j] = dnz_row;
    bilen[j] = onz_row;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatGetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
  PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
  PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;

  PetscFunctionBegin;
  for (i = 0; i < m; i++) {
    if (idxm[i] < 0) continue; /* negative row */
    PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, idxm[i], mat->rmap->N - 1);
    PetscCheck(idxm[i] >= rstart && idxm[i] < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported, row requested %" PetscInt_FMT " range [%" PetscInt_FMT " %" PetscInt_FMT ")", idxm[i], rstart, rend);
    row = idxm[i] - rstart;
    for (j = 0; j < n; j++) {
      if (idxn[j] < 0) continue; /* negative column */
      PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, idxn[j], mat->cmap->N - 1);
      if (idxn[j] >= cstart && idxn[j] < cend) {
        col = idxn[j] - cstart;
        PetscCall(MatGetValues(aij->A, 1, &row, 1, &col, v + i * n + j));
      } else {
        if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
#if defined(PETSC_USE_CTABLE)
        PetscCall(PetscHMapIGetWithDefault(aij->colmap, idxn[j] + 1, 0, &col));
        col--;
#else
        col = aij->colmap[idxn[j]] - 1;
#endif
        if ((col < 0) || (aij->garray[col] != idxn[j])) *(v + i * n + j) = 0.0;
        else PetscCall(MatGetValues(aij->B, 1, &row, 1, &col, v + i * n + j));
      }
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyBegin_MPIAIJ(Mat mat, MatAssemblyType mode)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
  PetscInt    nstash, reallocs;

  PetscFunctionBegin;
  if (aij->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);

  PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
  PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
  PetscCall(PetscInfo(aij->A, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat mat, MatAssemblyType mode)
{
  Mat_MPIAIJ  *aij = (Mat_MPIAIJ *)mat->data;
  PetscMPIInt  n;
  PetscInt     i, j, rstart, ncols, flg;
  PetscInt    *row, *col;
  PetscBool    other_disassembled;
  PetscScalar *val;

  /* do not use 'b = (Mat_SeqAIJ*)aij->B->data' as B can be reset in disassembly */

  PetscFunctionBegin;
  if (!aij->donotstash && !mat->nooffprocentries) {
    while (1) {
      PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
      if (!flg) break;

      for (i = 0; i < n;) {
        /* Now identify the consecutive vals belonging to the same row */
        for (j = i, rstart = row[j]; j < n; j++) {
          if (row[j] != rstart) break;
        }
        if (j < n) ncols = j - i;
        else ncols = n - i;
        /* Now assemble all these values with a single function call */
        PetscCall(MatSetValues_MPIAIJ(mat, 1, row + i, ncols, col + i, val + i, mat->insertmode));
        i = j;
      }
    }
    PetscCall(MatStashScatterEnd_Private(&mat->stash));
  }
#if defined(PETSC_HAVE_DEVICE)
  if (mat->offloadmask == PETSC_OFFLOAD_CPU) aij->A->offloadmask = PETSC_OFFLOAD_CPU;
  /* We call MatBindToCPU() on aij->A and aij->B here, because if MatBindToCPU_MPIAIJ() is called before assembly, it cannot bind these. */
  if (mat->boundtocpu) {
    PetscCall(MatBindToCPU(aij->A, PETSC_TRUE));
    PetscCall(MatBindToCPU(aij->B, PETSC_TRUE));
  }
#endif
  PetscCall(MatAssemblyBegin(aij->A, mode));
  PetscCall(MatAssemblyEnd(aij->A, mode));

  /* determine if any process has disassembled; if so we must
     also disassemble ourselves, in order that we may reassemble */
  /*
     if the nonzero structure of submatrix B cannot change then we know that
     no process disassembled, so we can skip this step
  */
  if (!((Mat_SeqAIJ *)aij->B->data)->nonew) {
    PetscCallMPI(MPIU_Allreduce(&mat->was_assembled, &other_disassembled, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
    if (mat->was_assembled && !other_disassembled) { /* mat on this rank has reduced off-diag B with local col ids, but globally it does not */
      PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
    }
  }
  if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(MatSetUpMultiply_MPIAIJ(mat));
  PetscCall(MatSetOption(aij->B, MAT_USE_INODES, PETSC_FALSE));
#if defined(PETSC_HAVE_DEVICE)
  if (mat->offloadmask == PETSC_OFFLOAD_CPU && aij->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) aij->B->offloadmask = PETSC_OFFLOAD_CPU;
#endif
  PetscCall(MatAssemblyBegin(aij->B, mode));
  PetscCall(MatAssemblyEnd(aij->B, mode));

  PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));

  aij->rowvalues = NULL;

  PetscCall(VecDestroy(&aij->diag));

  /* if no new nonzero locations are allowed in the matrix then only set the matrix state the first time through */
  if ((!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) || !((Mat_SeqAIJ *)aij->A->data)->nonew) {
    PetscObjectState state = aij->A->nonzerostate + aij->B->nonzerostate;
    PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
  }
#if defined(PETSC_HAVE_DEVICE)
  mat->offloadmask = PETSC_OFFLOAD_BOTH;
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_MPIAIJ(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroRows_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
{
  Mat_MPIAIJ *mat = (Mat_MPIAIJ *)A->data;
  PetscInt   *lrows;
  PetscInt    r, len;
  PetscBool   cong;

  PetscFunctionBegin;
  /* get locally owned rows */
  PetscCall(MatZeroRowsMapLocal_Private(A, N, rows, &len, &lrows));
  PetscCall(MatHasCongruentLayouts(A, &cong));
  /* fix right-hand side if needed */
  if (x && b) {
    const PetscScalar *xx;
    PetscScalar       *bb;

    PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
    PetscCall(VecGetArrayRead(x, &xx));
    PetscCall(VecGetArray(b, &bb));
    for (r = 0; r < len; ++r) bb[lrows[r]] = diag * xx[lrows[r]];
    PetscCall(VecRestoreArrayRead(x, &xx));
    PetscCall(VecRestoreArray(b, &bb));
  }

  if (diag != 0.0 && cong) {
    PetscCall(MatZeroRows(mat->A, len, lrows, diag, NULL, NULL));
    PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
  } else if (diag != 0.0) { /* non-square or non-congruent layouts -> if keepnonzeropattern is false, we allow for new insertions */
    Mat_SeqAIJ *aijA = (Mat_SeqAIJ *)mat->A->data;
    Mat_SeqAIJ *aijB = (Mat_SeqAIJ *)mat->B->data;
    PetscInt    nnwA, nnwB;
    PetscBool   nnzA, nnzB;

    nnwA = aijA->nonew;
    nnwB = aijB->nonew;
    nnzA = aijA->keepnonzeropattern;
    nnzB = aijB->keepnonzeropattern;
    if (!nnzA) {
      PetscCall(PetscInfo(mat->A, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on diagonal block.\n"));
      aijA->nonew = 0;
    }
    if (!nnzB) {
      PetscCall(PetscInfo(mat->B, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on off-diagonal block.\n"));
      aijB->nonew = 0;
    }
    /* Must zero here before the next loop */
    PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
    PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
    for (r = 0; r < len; ++r) {
      const PetscInt row = lrows[r] + A->rmap->rstart;
      if (row >= A->cmap->N) continue;
      PetscCall(MatSetValues(A, 1, &row, 1, &row, &diag, INSERT_VALUES));
    }
    aijA->nonew = nnwA;
    aijB->nonew = nnwB;
  } else {
    PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
    PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
  }
  PetscCall(PetscFree(lrows));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));

  /* only change matrix nonzero state if pattern was allowed to be changed */
  if (!((Mat_SeqAIJ *)mat->A->data)->keepnonzeropattern || !((Mat_SeqAIJ *)mat->A->data)->nonew) {
    PetscObjectState state = mat->A->nonzerostate + mat->B->nonzerostate;
    PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroRowsColumns_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
{
  Mat_MPIAIJ        *l = (Mat_MPIAIJ *)A->data;
  PetscInt           n = A->rmap->n;
  PetscInt           i, j, r, m, len = 0;
  PetscInt          *lrows, *owners = A->rmap->range;
  PetscMPIInt        p = 0;
  PetscSFNode       *rrows;
  PetscSF            sf;
  const PetscScalar *xx;
  PetscScalar       *bb, *mask, *aij_a;
  Vec                xmask, lmask;
  Mat_SeqAIJ        *aij = (Mat_SeqAIJ *)l->B->data;
  const PetscInt    *aj, *ii, *ridx;
  PetscScalar       *aa;

  PetscFunctionBegin;
  /* Create SF where leaves are input rows and roots are owned rows */
  PetscCall(PetscMalloc1(n, &lrows));
  for (r = 0; r < n; ++r) lrows[r] = -1;
  PetscCall(PetscMalloc1(N, &rrows));
  for (r = 0; r < N; ++r) {
    const PetscInt idx = rows[r];
    PetscCheck(idx >= 0 && A->rmap->N > idx, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range [0,%" PetscInt_FMT ")", idx, A->rmap->N);
    if (idx < owners[p] || owners[p + 1] <= idx) { /* short-circuit the search if the last p owns this row too */
      PetscCall(PetscLayoutFindOwner(A->rmap, idx, &p));
    }
    rrows[r].rank  = p;
    rrows[r].index = rows[r] - owners[p];
  }
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
  PetscCall(PetscSFSetGraph(sf, n, N, NULL, PETSC_OWN_POINTER, rrows, PETSC_OWN_POINTER));
  /* Collect flags for rows to be zeroed */
  PetscCall(PetscSFReduceBegin(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
  PetscCall(PetscSFReduceEnd(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
  PetscCall(PetscSFDestroy(&sf));
  /* Compress and put in row numbers */
  for (r = 0; r < n; ++r)
    if (lrows[r] >= 0) lrows[len++] = r;
  /* zero diagonal part of matrix */
  PetscCall(MatZeroRowsColumns(l->A, len, lrows, diag, x, b));
  /* handle off-diagonal part of matrix */
  PetscCall(MatCreateVecs(A, &xmask, NULL));
  PetscCall(VecDuplicate(l->lvec, &lmask));
  PetscCall(VecGetArray(xmask, &bb));
  for (i = 0; i < len; i++) bb[lrows[i]] = 1;
  PetscCall(VecRestoreArray(xmask, &bb));
  PetscCall(VecScatterBegin(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
  PetscCall(VecScatterEnd(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
  PetscCall(VecDestroy(&xmask));
  if (x && b) { /* this code is buggy when the row and column layout don't match */
    PetscBool cong;

    PetscCall(MatHasCongruentLayouts(A, &cong));
    PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
    PetscCall(VecScatterBegin(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecScatterEnd(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
    PetscCall(VecGetArrayRead(l->lvec, &xx));
    PetscCall(VecGetArray(b, &bb));
  }
  PetscCall(VecGetArray(lmask, &mask));
  /* remove zeroed rows of off-diagonal matrix */
  PetscCall(MatSeqAIJGetArray(l->B, &aij_a));
  ii = aij->i;
  for (i = 0; i < len; i++) PetscCall(PetscArrayzero(PetscSafePointerPlusOffset(aij_a, ii[lrows[i]]), ii[lrows[i] + 1] - ii[lrows[i]]));
  /* loop over all elements of the off-process part of the matrix, zeroing removed columns */
  if (aij->compressedrow.use) {
    m    = aij->compressedrow.nrows;
    ii   = aij->compressedrow.i;
    ridx = aij->compressedrow.rindex;
    for (i = 0; i < m; i++) {
      n  = ii[i + 1] - ii[i];
      aj = aij->j + ii[i];
      aa = aij_a + ii[i];

      for (j = 0; j < n; j++) {
        if (PetscAbsScalar(mask[*aj])) {
          if (b) bb[*ridx] -= *aa * xx[*aj];
          *aa = 0.0;
        }
        aa++;
        aj++;
      }
      ridx++;
    }
  } else { /* do not use compressed row format */
    m = l->B->rmap->n;
    for (i = 0; i < m; i++) {
      n  = ii[i + 1] - ii[i];
      aj = aij->j + ii[i];
      aa = aij_a + ii[i];
      for (j = 0; j < n; j++) {
        if (PetscAbsScalar(mask[*aj])) {
          if (b) bb[i] -= *aa * xx[*aj];
          *aa = 0.0;
        }
        aa++;
        aj++;
      }
    }
  }
  if (x && b) {
    PetscCall(VecRestoreArray(b, &bb));
    PetscCall(VecRestoreArrayRead(l->lvec, &xx));
  }
  PetscCall(MatSeqAIJRestoreArray(l->B, &aij_a));
  PetscCall(VecRestoreArray(lmask, &mask));
  PetscCall(VecDestroy(&lmask));
  PetscCall(PetscFree(lrows));

  /* only change matrix nonzero state if pattern was allowed to be changed */
  if (!((Mat_SeqAIJ *)l->A->data)->nonew) {
    PetscObjectState state = l->A->nonzerostate + l->B->nonzerostate;
    PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMult_MPIAIJ(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
  PetscInt    nt;
  VecScatter  Mvctx = a->Mvctx;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(xx, &nt));
  PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
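  /* overlap communication with computation: start the halo scatter for the off-diagonal part,
     multiply by the diagonal block A while it is in flight, then add B's contribution */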
  PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscUseTypeMethod(a->A, mult, xx, yy);
  PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscUseTypeMethod(a->B, multadd, a->lvec, yy, yy);
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultDiagonalBlock_MPIAIJ(Mat A, Vec bb, Vec xx)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatMultDiagonalBlock(a->A, bb, xx));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a     = (Mat_MPIAIJ *)A->data;
  VecScatter  Mvctx = a->Mvctx;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultTranspose_MPIAIJ(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  /* do nondiagonal part */
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  /* do local part */
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  /* add partial results together */
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatIsTranspose_MPIAIJ(Mat Amat, Mat Bmat, PetscReal tol, PetscBool *f)
{
  MPI_Comm    comm;
  Mat_MPIAIJ *Aij = (Mat_MPIAIJ *)Amat->data, *Bij = (Mat_MPIAIJ *)Bmat->data;
  Mat         Adia = Aij->A, Bdia = Bij->A, Aoff, Boff, *Aoffs, *Boffs;
  IS          Me, Notme;
  PetscInt    M, N, first, last, *notme, i;
  PetscBool   lf;
  PetscMPIInt size;

  PetscFunctionBegin;
  /* Easy test: symmetric diagonal block */
  PetscCall(MatIsTranspose(Adia, Bdia, tol, &lf));
  PetscCallMPI(MPIU_Allreduce(&lf, f, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)Amat)));
  if (!*f) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size == 1) PetscFunctionReturn(PETSC_SUCCESS);

  /* Hard test: off-diagonal block. This requires a call to MatCreateSubMatrices(). */
  PetscCall(MatGetSize(Amat, &M, &N));
  PetscCall(MatGetOwnershipRange(Amat, &first, &last));
  PetscCall(PetscMalloc1(N - last + first, &notme));
  for (i = 0; i < first; i++) notme[i] = i;
  for (i = last; i < M; i++) notme[i - last + first] = i;
  PetscCall(ISCreateGeneral(MPI_COMM_SELF, N - last + first, notme, PETSC_COPY_VALUES, &Notme));
  PetscCall(ISCreateStride(MPI_COMM_SELF, last - first, first, 1, &Me));
  PetscCall(MatCreateSubMatrices(Amat, 1, &Me, &Notme, MAT_INITIAL_MATRIX, &Aoffs));
  Aoff = Aoffs[0];
  PetscCall(MatCreateSubMatrices(Bmat, 1, &Notme, &Me, MAT_INITIAL_MATRIX, &Boffs));
  Boff = Boffs[0];
  PetscCall(MatIsTranspose(Aoff, Boff, tol, f));
  PetscCall(MatDestroyMatrices(1, &Aoffs));
  PetscCall(MatDestroyMatrices(1, &Boffs));
  PetscCall(ISDestroy(&Me));
  PetscCall(ISDestroy(&Notme));
  PetscCall(PetscFree(notme));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultTransposeAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  /* do nondiagonal part */
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  /* do local part */
  PetscCall((*a->A->ops->multtransposeadd)(a->A, xx, yy, zz));
  /* add partial results together */
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  This only works correctly for square matrices where the subblock a->A is the
  diagonal block
*/
static PetscErrorCode MatGetDiagonal_MPIAIJ(Mat A, Vec v)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCheck(A->rmap->N == A->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Supports only square matrix where A->A is diag block");
  PetscCheck(A->rmap->rstart == A->cmap->rstart && A->rmap->rend == A->cmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "row partition must equal col partition");
  PetscCall(MatGetDiagonal(a->A, v));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatScale_MPIAIJ(Mat A, PetscScalar aa)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatScale(a->A, aa));
  PetscCall(MatScale(a->B, aa));
  PetscFunctionReturn(PETSC_SUCCESS);
}

1187 static PetscErrorCode MatView_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
1188 {
1189   Mat_MPIAIJ        *aij    = (Mat_MPIAIJ *)mat->data;
1190   Mat_SeqAIJ        *A      = (Mat_SeqAIJ *)aij->A->data;
1191   Mat_SeqAIJ        *B      = (Mat_SeqAIJ *)aij->B->data;
1192   const PetscInt    *garray = aij->garray;
1193   const PetscScalar *aa, *ba;
1194   PetscInt           header[4], M, N, m, rs, cs, cnt, i, ja, jb;
1195   PetscInt64         nz, hnz;
1196   PetscInt          *rowlens;
1197   PetscInt          *colidxs;
1198   PetscScalar       *matvals;
1199   PetscMPIInt        rank;
1200 
1201   PetscFunctionBegin;
1202   PetscCall(PetscViewerSetUp(viewer));
1203 
1204   M  = mat->rmap->N;
1205   N  = mat->cmap->N;
1206   m  = mat->rmap->n;
1207   rs = mat->rmap->rstart;
1208   cs = mat->cmap->rstart;
1209   nz = A->nz + B->nz;
1210 
1211   /* write matrix header */
1212   header[0] = MAT_FILE_CLASSID;
1213   header[1] = M;
1214   header[2] = N;
1215   PetscCallMPI(MPI_Reduce(&nz, &hnz, 1, MPIU_INT64, MPI_SUM, 0, PetscObjectComm((PetscObject)mat)));
1216   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1217   if (rank == 0) PetscCall(PetscIntCast(hnz, &header[3]));
1218   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));
1219 
1220   /* fill in and store row lengths  */
1221   PetscCall(PetscMalloc1(m, &rowlens));
1222   for (i = 0; i < m; i++) rowlens[i] = A->i[i + 1] - A->i[i] + B->i[i + 1] - B->i[i];
1223   PetscCall(PetscViewerBinaryWriteAll(viewer, rowlens, m, rs, M, PETSC_INT));
1224   PetscCall(PetscFree(rowlens));
1225 
1226   /* fill in and store column indices */
1227   PetscCall(PetscMalloc1(nz, &colidxs));
1228   for (cnt = 0, i = 0; i < m; i++) {
1229     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1230       if (garray[B->j[jb]] > cs) break;
1231       colidxs[cnt++] = garray[B->j[jb]];
1232     }
1233     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) colidxs[cnt++] = A->j[ja] + cs;
1234     for (; jb < B->i[i + 1]; jb++) colidxs[cnt++] = garray[B->j[jb]];
1235   }
1236   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1237   PetscCall(PetscViewerBinaryWriteAll(viewer, colidxs, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
1238   PetscCall(PetscFree(colidxs));
1239 
1240   /* fill in and store nonzero values */
1241   PetscCall(MatSeqAIJGetArrayRead(aij->A, &aa));
1242   PetscCall(MatSeqAIJGetArrayRead(aij->B, &ba));
1243   PetscCall(PetscMalloc1(nz, &matvals));
1244   for (cnt = 0, i = 0; i < m; i++) {
1245     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1246       if (garray[B->j[jb]] > cs) break;
1247       matvals[cnt++] = ba[jb];
1248     }
1249     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) matvals[cnt++] = aa[ja];
1250     for (; jb < B->i[i + 1]; jb++) matvals[cnt++] = ba[jb];
1251   }
1252   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &aa));
1253   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &ba));
1254   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1255   PetscCall(PetscViewerBinaryWriteAll(viewer, matvals, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
1256   PetscCall(PetscFree(matvals));
1257 
1258   /* write block size option to the viewer's .info file */
1259   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1260   PetscFunctionReturn(PETSC_SUCCESS);
1261 }
1262 
1263 #include <petscdraw.h>
1264 static PetscErrorCode MatView_MPIAIJ_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
1265 {
1266   Mat_MPIAIJ       *aij  = (Mat_MPIAIJ *)mat->data;
1267   PetscMPIInt       rank = aij->rank, size = aij->size;
1268   PetscBool         isdraw, iascii, isbinary;
1269   PetscViewer       sviewer;
1270   PetscViewerFormat format;
1271 
1272   PetscFunctionBegin;
1273   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1274   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1275   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1276   if (iascii) {
1277     PetscCall(PetscViewerGetFormat(viewer, &format));
1278     if (format == PETSC_VIEWER_LOAD_BALANCE) {
1279       PetscInt i, nmax = 0, nmin = PETSC_INT_MAX, navg = 0, *nz, nzlocal = ((Mat_SeqAIJ *)aij->A->data)->nz + ((Mat_SeqAIJ *)aij->B->data)->nz;
1280       PetscCall(PetscMalloc1(size, &nz));
1281       PetscCallMPI(MPI_Allgather(&nzlocal, 1, MPIU_INT, nz, 1, MPIU_INT, PetscObjectComm((PetscObject)mat)));
1282       for (i = 0; i < size; i++) {
1283         nmax = PetscMax(nmax, nz[i]);
1284         nmin = PetscMin(nmin, nz[i]);
1285         navg += nz[i];
1286       }
1287       PetscCall(PetscFree(nz));
1288       navg = navg / size;
1289       PetscCall(PetscViewerASCIIPrintf(viewer, "Load Balance - Nonzeros: Min %" PetscInt_FMT "  avg %" PetscInt_FMT "  max %" PetscInt_FMT "\n", nmin, navg, nmax));
1290       PetscFunctionReturn(PETSC_SUCCESS);
1291     }
1292     PetscCall(PetscViewerGetFormat(viewer, &format));
1293     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1294       MatInfo   info;
1295       PetscInt *inodes = NULL;
1296 
1297       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1298       PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
1299       PetscCall(MatInodeGetInodeSizes(aij->A, NULL, &inodes, NULL));
1300       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
1301       if (!inodes) {
1302         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, not using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
1303                                                      info.memory));
1304       } else {
1305         PetscCall(
1306           PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated, info.memory));
1307       }
1308       PetscCall(MatGetInfo(aij->A, MAT_LOCAL, &info));
1309       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] on-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1310       PetscCall(MatGetInfo(aij->B, MAT_LOCAL, &info));
1311       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] off-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1312       PetscCall(PetscViewerFlush(viewer));
1313       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
1314       PetscCall(PetscViewerASCIIPrintf(viewer, "Information on VecScatter used in matrix-vector product: \n"));
1315       PetscCall(VecScatterView(aij->Mvctx, viewer));
1316       PetscFunctionReturn(PETSC_SUCCESS);
1317     } else if (format == PETSC_VIEWER_ASCII_INFO) {
1318       PetscInt inodecount, inodelimit, *inodes;
1319       PetscCall(MatInodeGetInodeSizes(aij->A, &inodecount, &inodes, &inodelimit));
1320       if (inodes) {
1321         PetscCall(PetscViewerASCIIPrintf(viewer, "using I-node (on process 0) routines: found %" PetscInt_FMT " nodes, limit used is %" PetscInt_FMT "\n", inodecount, inodelimit));
1322       } else {
1323         PetscCall(PetscViewerASCIIPrintf(viewer, "not using I-node (on process 0) routines\n"));
1324       }
1325       PetscFunctionReturn(PETSC_SUCCESS);
1326     } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1327       PetscFunctionReturn(PETSC_SUCCESS);
1328     }
1329   } else if (isbinary) {
1330     if (size == 1) {
1331       PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1332       PetscCall(MatView(aij->A, viewer));
1333     } else {
1334       PetscCall(MatView_MPIAIJ_Binary(mat, viewer));
1335     }
1336     PetscFunctionReturn(PETSC_SUCCESS);
1337   } else if (iascii && size == 1) {
1338     PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1339     PetscCall(MatView(aij->A, viewer));
1340     PetscFunctionReturn(PETSC_SUCCESS);
1341   } else if (isdraw) {
1342     PetscDraw draw;
1343     PetscBool isnull;
1344     PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1345     PetscCall(PetscDrawIsNull(draw, &isnull));
1346     if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1347   }
1348 
1349   { /* assemble the entire matrix onto first processor */
1350     Mat A = NULL, Av;
1351     IS  isrow, iscol;
1352 
1353     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->rmap->N : 0, 0, 1, &isrow));
1354     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->cmap->N : 0, 0, 1, &iscol));
1355     PetscCall(MatCreateSubMatrix(mat, isrow, iscol, MAT_INITIAL_MATRIX, &A));
1356     PetscCall(MatMPIAIJGetSeqAIJ(A, &Av, NULL, NULL));
1357     /* The commented-out code below uses MatCreateSubMatrices() instead */
1358     /*
1359     Mat *AA, A = NULL, Av;
1360     IS  isrow,iscol;
1361 
1362     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->rmap->N : 0,0,1,&isrow));
1363     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->cmap->N : 0,0,1,&iscol));
1364     PetscCall(MatCreateSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&AA));
1365     if (rank == 0) {
1366        PetscCall(PetscObjectReference((PetscObject)AA[0]));
1367        A    = AA[0];
1368        Av   = AA[0];
1369     }
1370     PetscCall(MatDestroySubMatrices(1,&AA));
1371 */
1372     PetscCall(ISDestroy(&iscol));
1373     PetscCall(ISDestroy(&isrow));
1374     /*
1375        Every process must participate in drawing the matrix since the graphics waits are
1376        synchronized across all processes that share the PetscDraw object
1377     */
1378     PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1379     if (rank == 0) {
1380       if (((PetscObject)mat)->name) PetscCall(PetscObjectSetName((PetscObject)Av, ((PetscObject)mat)->name));
1381       PetscCall(MatView_SeqAIJ(Av, sviewer));
1382     }
1383     PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1384     PetscCall(MatDestroy(&A));
1385   }
1386   PetscFunctionReturn(PETSC_SUCCESS);
1387 }
1388 
1389 PetscErrorCode MatView_MPIAIJ(Mat mat, PetscViewer viewer)
1390 {
1391   PetscBool iascii, isdraw, issocket, isbinary;
1392 
1393   PetscFunctionBegin;
1394   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1395   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1396   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1397   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
1398   if (iascii || isdraw || isbinary || issocket) PetscCall(MatView_MPIAIJ_ASCIIorDraworSocket(mat, viewer));
1399   PetscFunctionReturn(PETSC_SUCCESS);
1400 }
1401 
1402 static PetscErrorCode MatSOR_MPIAIJ(Mat matin, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1403 {
1404   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)matin->data;
1405   Vec         bb1 = NULL;
1406   PetscBool   hasop;
1407 
1408   PetscFunctionBegin;
1409   if (flag == SOR_APPLY_UPPER) {
1410     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1411     PetscFunctionReturn(PETSC_SUCCESS);
1412   }
1413 
1414   if (its > 1 || ~flag & SOR_ZERO_INITIAL_GUESS || flag & SOR_EISENSTAT) PetscCall(VecDuplicate(bb, &bb1));
1415 
1416   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP) {
1417     if (flag & SOR_ZERO_INITIAL_GUESS) {
1418       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1419       its--;
1420     }
1421 
1422     while (its--) {
1423       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1424       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1425 
1426       /* update rhs: bb1 = bb - B*x */
1427       PetscCall(VecScale(mat->lvec, -1.0));
1428       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1429 
1430       /* local sweep */
1431       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_SYMMETRIC_SWEEP, fshift, lits, 1, xx));
1432     }
1433   } else if (flag & SOR_LOCAL_FORWARD_SWEEP) {
1434     if (flag & SOR_ZERO_INITIAL_GUESS) {
1435       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1436       its--;
1437     }
1438     while (its--) {
1439       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1440       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1441 
1442       /* update rhs: bb1 = bb - B*x */
1443       PetscCall(VecScale(mat->lvec, -1.0));
1444       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1445 
1446       /* local sweep */
1447       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_FORWARD_SWEEP, fshift, lits, 1, xx));
1448     }
1449   } else if (flag & SOR_LOCAL_BACKWARD_SWEEP) {
1450     if (flag & SOR_ZERO_INITIAL_GUESS) {
1451       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1452       its--;
1453     }
1454     while (its--) {
1455       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1456       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1457 
1458       /* update rhs: bb1 = bb - B*x */
1459       PetscCall(VecScale(mat->lvec, -1.0));
1460       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1461 
1462       /* local sweep */
1463       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_BACKWARD_SWEEP, fshift, lits, 1, xx));
1464     }
1465   } else if (flag & SOR_EISENSTAT) {
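    /* Eisenstat's trick: one zero-initial-guess local backward sweep, a diagonally scaled right-hand side
       update, then one local forward sweep whose result is added to xx */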
1466     Vec xx1;
1467 
1468     PetscCall(VecDuplicate(bb, &xx1));
1469     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_BACKWARD_SWEEP), fshift, lits, 1, xx));
1470 
1471     PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1472     PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1473     if (!mat->diag) {
1474       PetscCall(MatCreateVecs(matin, &mat->diag, NULL));
1475       PetscCall(MatGetDiagonal(matin, mat->diag));
1476     }
1477     PetscCall(MatHasOperation(matin, MATOP_MULT_DIAGONAL_BLOCK, &hasop));
1478     if (hasop) {
1479       PetscCall(MatMultDiagonalBlock(matin, xx, bb1));
1480     } else {
1481       PetscCall(VecPointwiseMult(bb1, mat->diag, xx));
1482     }
1483     PetscCall(VecAYPX(bb1, (omega - 2.0) / omega, bb));
1484 
1485     PetscCall(MatMultAdd(mat->B, mat->lvec, bb1, bb1));
1486 
1487     /* local sweep */
1488     PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_FORWARD_SWEEP), fshift, lits, 1, xx1));
1489     PetscCall(VecAXPY(xx, 1.0, xx1));
1490     PetscCall(VecDestroy(&xx1));
1491   } else SETERRQ(PetscObjectComm((PetscObject)matin), PETSC_ERR_SUP, "Parallel SOR not supported");
1492 
1493   PetscCall(VecDestroy(&bb1));
1494 
1495   matin->factorerrortype = mat->A->factorerrortype;
1496   PetscFunctionReturn(PETSC_SUCCESS);
1497 }
1498 
1499 static PetscErrorCode MatPermute_MPIAIJ(Mat A, IS rowp, IS colp, Mat *B)
1500 {
1501   Mat             aA, aB, Aperm;
1502   const PetscInt *rwant, *cwant, *gcols, *ai, *bi, *aj, *bj;
1503   PetscScalar    *aa, *ba;
1504   PetscInt        i, j, m, n, ng, anz, bnz, *dnnz, *onnz, *tdnnz, *tonnz, *rdest, *cdest, *work, *gcdest;
1505   PetscSF         rowsf, sf;
1506   IS              parcolp = NULL;
1507   PetscBool       done;
1508 
1509   PetscFunctionBegin;
1510   PetscCall(MatGetLocalSize(A, &m, &n));
1511   PetscCall(ISGetIndices(rowp, &rwant));
1512   PetscCall(ISGetIndices(colp, &cwant));
1513   PetscCall(PetscMalloc3(PetscMax(m, n), &work, m, &rdest, n, &cdest));
1514 
1515   /* Invert row permutation to find out where my rows should go */
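  /* leaves of rowsf point at the global rows listed in rwant; reducing each rank's own global row numbers
     through the SF leaves rdest[i] holding the destination of local row i under the inverse permutation */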
1516   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &rowsf));
1517   PetscCall(PetscSFSetGraphLayout(rowsf, A->rmap, A->rmap->n, NULL, PETSC_OWN_POINTER, rwant));
1518   PetscCall(PetscSFSetFromOptions(rowsf));
1519   for (i = 0; i < m; i++) work[i] = A->rmap->rstart + i;
1520   PetscCall(PetscSFReduceBegin(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1521   PetscCall(PetscSFReduceEnd(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1522 
1523   /* Invert column permutation to find out where my columns should go */
1524   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1525   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, A->cmap->n, NULL, PETSC_OWN_POINTER, cwant));
1526   PetscCall(PetscSFSetFromOptions(sf));
1527   for (i = 0; i < n; i++) work[i] = A->cmap->rstart + i;
1528   PetscCall(PetscSFReduceBegin(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1529   PetscCall(PetscSFReduceEnd(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1530   PetscCall(PetscSFDestroy(&sf));
1531 
1532   PetscCall(ISRestoreIndices(rowp, &rwant));
1533   PetscCall(ISRestoreIndices(colp, &cwant));
1534   PetscCall(MatMPIAIJGetSeqAIJ(A, &aA, &aB, &gcols));
1535 
1536   /* Find out where my gcols should go */
1537   PetscCall(MatGetSize(aB, NULL, &ng));
1538   PetscCall(PetscMalloc1(ng, &gcdest));
1539   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1540   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, ng, NULL, PETSC_OWN_POINTER, gcols));
1541   PetscCall(PetscSFSetFromOptions(sf));
1542   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1543   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1544   PetscCall(PetscSFDestroy(&sf));
1545 
1546   PetscCall(PetscCalloc4(m, &dnnz, m, &onnz, m, &tdnnz, m, &tonnz));
1547   PetscCall(MatGetRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1548   PetscCall(MatGetRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1549   for (i = 0; i < m; i++) {
1550     PetscInt    row = rdest[i];
1551     PetscMPIInt rowner;
1552     PetscCall(PetscLayoutFindOwner(A->rmap, row, &rowner));
1553     for (j = ai[i]; j < ai[i + 1]; j++) {
1554       PetscInt    col = cdest[aj[j]];
1555       PetscMPIInt cowner;
1556       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner)); /* Could build an index for the columns to eliminate this search */
1557       if (rowner == cowner) dnnz[i]++;
1558       else onnz[i]++;
1559     }
1560     for (j = bi[i]; j < bi[i + 1]; j++) {
1561       PetscInt    col = gcdest[bj[j]];
1562       PetscMPIInt cowner;
1563       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner));
1564       if (rowner == cowner) dnnz[i]++;
1565       else onnz[i]++;
1566     }
1567   }
1568   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1569   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1570   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1571   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1572   PetscCall(PetscSFDestroy(&rowsf));
1573 
1574   PetscCall(MatCreateAIJ(PetscObjectComm((PetscObject)A), A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N, 0, tdnnz, 0, tonnz, &Aperm));
1575   PetscCall(MatSeqAIJGetArray(aA, &aa));
1576   PetscCall(MatSeqAIJGetArray(aB, &ba));
1577   for (i = 0; i < m; i++) {
1578     PetscInt *acols = dnnz, *bcols = onnz; /* Repurpose now-unneeded arrays */
1579     PetscInt  j0, rowlen;
1580     rowlen = ai[i + 1] - ai[i];
1581     for (j0 = j = 0; j < rowlen; j0 = j) { /* rowlen can exceed m, the length of the repurposed scratch arrays, so insert the values in batches of at most m */
1582       for (; j < PetscMin(rowlen, j0 + m); j++) acols[j - j0] = cdest[aj[ai[i] + j]];
1583       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, acols, aa + ai[i] + j0, INSERT_VALUES));
1584     }
1585     rowlen = bi[i + 1] - bi[i];
1586     for (j0 = j = 0; j < rowlen; j0 = j) {
1587       for (; j < PetscMin(rowlen, j0 + m); j++) bcols[j - j0] = gcdest[bj[bi[i] + j]];
1588       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, bcols, ba + bi[i] + j0, INSERT_VALUES));
1589     }
1590   }
1591   PetscCall(MatAssemblyBegin(Aperm, MAT_FINAL_ASSEMBLY));
1592   PetscCall(MatAssemblyEnd(Aperm, MAT_FINAL_ASSEMBLY));
1593   PetscCall(MatRestoreRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1594   PetscCall(MatRestoreRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1595   PetscCall(MatSeqAIJRestoreArray(aA, &aa));
1596   PetscCall(MatSeqAIJRestoreArray(aB, &ba));
1597   PetscCall(PetscFree4(dnnz, onnz, tdnnz, tonnz));
1598   PetscCall(PetscFree3(work, rdest, cdest));
1599   PetscCall(PetscFree(gcdest));
1600   if (parcolp) PetscCall(ISDestroy(&colp));
1601   *B = Aperm;
1602   PetscFunctionReturn(PETSC_SUCCESS);
1603 }
1604 
1605 static PetscErrorCode MatGetGhosts_MPIAIJ(Mat mat, PetscInt *nghosts, const PetscInt *ghosts[])
1606 {
1607   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1608 
1609   PetscFunctionBegin;
1610   PetscCall(MatGetSize(aij->B, NULL, nghosts));
1611   if (ghosts) *ghosts = aij->garray;
1612   PetscFunctionReturn(PETSC_SUCCESS);
1613 }
1614 
1615 static PetscErrorCode MatGetInfo_MPIAIJ(Mat matin, MatInfoType flag, MatInfo *info)
1616 {
1617   Mat_MPIAIJ    *mat = (Mat_MPIAIJ *)matin->data;
1618   Mat            A = mat->A, B = mat->B;
1619   PetscLogDouble isend[5], irecv[5];
1620 
1621   PetscFunctionBegin;
1622   info->block_size = 1.0;
1623   PetscCall(MatGetInfo(A, MAT_LOCAL, info));
1624 
1625   isend[0] = info->nz_used;
1626   isend[1] = info->nz_allocated;
1627   isend[2] = info->nz_unneeded;
1628   isend[3] = info->memory;
1629   isend[4] = info->mallocs;
1630 
1631   PetscCall(MatGetInfo(B, MAT_LOCAL, info));
1632 
1633   isend[0] += info->nz_used;
1634   isend[1] += info->nz_allocated;
1635   isend[2] += info->nz_unneeded;
1636   isend[3] += info->memory;
1637   isend[4] += info->mallocs;
1638   if (flag == MAT_LOCAL) {
1639     info->nz_used      = isend[0];
1640     info->nz_allocated = isend[1];
1641     info->nz_unneeded  = isend[2];
1642     info->memory       = isend[3];
1643     info->mallocs      = isend[4];
1644   } else if (flag == MAT_GLOBAL_MAX) {
1645     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_MAX, PetscObjectComm((PetscObject)matin)));
1646 
1647     info->nz_used      = irecv[0];
1648     info->nz_allocated = irecv[1];
1649     info->nz_unneeded  = irecv[2];
1650     info->memory       = irecv[3];
1651     info->mallocs      = irecv[4];
1652   } else if (flag == MAT_GLOBAL_SUM) {
1653     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_SUM, PetscObjectComm((PetscObject)matin)));
1654 
1655     info->nz_used      = irecv[0];
1656     info->nz_allocated = irecv[1];
1657     info->nz_unneeded  = irecv[2];
1658     info->memory       = irecv[3];
1659     info->mallocs      = irecv[4];
1660   }
1661   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1662   info->fill_ratio_needed = 0;
1663   info->factor_mallocs    = 0;
1664   PetscFunctionReturn(PETSC_SUCCESS);
1665 }
1666 
1667 PetscErrorCode MatSetOption_MPIAIJ(Mat A, MatOption op, PetscBool flg)
1668 {
1669   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1670 
1671   PetscFunctionBegin;
1672   switch (op) {
1673   case MAT_NEW_NONZERO_LOCATIONS:
1674   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1675   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1676   case MAT_KEEP_NONZERO_PATTERN:
1677   case MAT_NEW_NONZERO_LOCATION_ERR:
1678   case MAT_USE_INODES:
1679   case MAT_IGNORE_ZERO_ENTRIES:
1680   case MAT_FORM_EXPLICIT_TRANSPOSE:
1681     MatCheckPreallocated(A, 1);
1682     PetscCall(MatSetOption(a->A, op, flg));
1683     PetscCall(MatSetOption(a->B, op, flg));
1684     break;
1685   case MAT_ROW_ORIENTED:
1686     MatCheckPreallocated(A, 1);
1687     a->roworiented = flg;
1688 
1689     PetscCall(MatSetOption(a->A, op, flg));
1690     PetscCall(MatSetOption(a->B, op, flg));
1691     break;
1692   case MAT_FORCE_DIAGONAL_ENTRIES:
1693   case MAT_SORTED_FULL:
1694     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
1695     break;
1696   case MAT_IGNORE_OFF_PROC_ENTRIES:
1697     a->donotstash = flg;
1698     break;
1699   /* Symmetry flags are handled directly by MatSetOption() and they don't affect preallocation */
1700   case MAT_SPD:
1701   case MAT_SYMMETRIC:
1702   case MAT_STRUCTURALLY_SYMMETRIC:
1703   case MAT_HERMITIAN:
1704   case MAT_SYMMETRY_ETERNAL:
1705   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
1706   case MAT_SPD_ETERNAL:
1707     /* if the diagonal block is square it inherits some of the properties above */
1708     break;
1709   case MAT_SUBMAT_SINGLEIS:
1710     A->submat_singleis = flg;
1711     break;
1712   case MAT_STRUCTURE_ONLY:
1713     /* The option is handled directly by MatSetOption() */
1714     break;
1715   default:
1716     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
1717   }
1718   PetscFunctionReturn(PETSC_SUCCESS);
1719 }
1720 
1721 PetscErrorCode MatGetRow_MPIAIJ(Mat matin, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1722 {
1723   Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)matin->data;
1724   PetscScalar *vworkA, *vworkB, **pvA, **pvB, *v_p;
1725   PetscInt     i, *cworkA, *cworkB, **pcA, **pcB, cstart = matin->cmap->rstart;
1726   PetscInt     nztot, nzA, nzB, lrow, rstart = matin->rmap->rstart, rend = matin->rmap->rend;
1727   PetscInt    *cmap, *idx_p;
1728 
1729   PetscFunctionBegin;
1730   PetscCheck(!mat->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Already active");
1731   mat->getrowactive = PETSC_TRUE;
1732 
1733   if (!mat->rowvalues && (idx || v)) {
1734     /*
1735         allocate enough space to hold information from the longest row.
1736     */
1737     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *)mat->A->data, *Ba = (Mat_SeqAIJ *)mat->B->data;
1738     PetscInt    max = 1, tmp;
1739     for (i = 0; i < matin->rmap->n; i++) {
1740       tmp = Aa->i[i + 1] - Aa->i[i] + Ba->i[i + 1] - Ba->i[i];
1741       if (max < tmp) max = tmp;
1742     }
1743     PetscCall(PetscMalloc2(max, &mat->rowvalues, max, &mat->rowindices));
1744   }
1745 
1746   PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Only local rows");
1747   lrow = row - rstart;
1748 
1749   pvA = &vworkA;
1750   pcA = &cworkA;
1751   pvB = &vworkB;
1752   pcB = &cworkB;
1753   if (!v) {
1754     pvA = NULL;
1755     pvB = NULL;
1756   }
1757   if (!idx) {
1758     pcA = NULL;
1759     if (!v) pcB = NULL;
1760   }
1761   PetscCall((*mat->A->ops->getrow)(mat->A, lrow, &nzA, pcA, pvA));
1762   PetscCall((*mat->B->ops->getrow)(mat->B, lrow, &nzB, pcB, pvB));
1763   nztot = nzA + nzB;
1764 
1765   cmap = mat->garray;
1766   if (v || idx) {
1767     if (nztot) {
1768       /* Sort by increasing column numbers, assuming A and B already sorted */
1769       PetscInt imark = -1;
1770       if (v) {
1771         *v = v_p = mat->rowvalues;
1772         for (i = 0; i < nzB; i++) {
1773           if (cmap[cworkB[i]] < cstart) v_p[i] = vworkB[i];
1774           else break;
1775         }
1776         imark = i;
1777         for (i = 0; i < nzA; i++) v_p[imark + i] = vworkA[i];
1778         for (i = imark; i < nzB; i++) v_p[nzA + i] = vworkB[i];
1779       }
1780       if (idx) {
1781         *idx = idx_p = mat->rowindices;
1782         if (imark > -1) {
1783           for (i = 0; i < imark; i++) idx_p[i] = cmap[cworkB[i]];
1784         } else {
1785           for (i = 0; i < nzB; i++) {
1786             if (cmap[cworkB[i]] < cstart) idx_p[i] = cmap[cworkB[i]];
1787             else break;
1788           }
1789           imark = i;
1790         }
1791         for (i = 0; i < nzA; i++) idx_p[imark + i] = cstart + cworkA[i];
1792         for (i = imark; i < nzB; i++) idx_p[nzA + i] = cmap[cworkB[i]];
1793       }
1794     } else {
1795       if (idx) *idx = NULL;
1796       if (v) *v = NULL;
1797     }
1798   }
1799   *nz = nztot;
1800   PetscCall((*mat->A->ops->restorerow)(mat->A, lrow, &nzA, pcA, pvA));
1801   PetscCall((*mat->B->ops->restorerow)(mat->B, lrow, &nzB, pcB, pvB));
1802   PetscFunctionReturn(PETSC_SUCCESS);
1803 }
1804 
1805 PetscErrorCode MatRestoreRow_MPIAIJ(Mat mat, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1806 {
1807   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1808 
1809   PetscFunctionBegin;
1810   PetscCheck(aij->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "MatGetRow() must be called first");
1811   aij->getrowactive = PETSC_FALSE;
1812   PetscFunctionReturn(PETSC_SUCCESS);
1813 }
1814 
1815 static PetscErrorCode MatNorm_MPIAIJ(Mat mat, NormType type, PetscReal *norm)
1816 {
1817   Mat_MPIAIJ      *aij  = (Mat_MPIAIJ *)mat->data;
1818   Mat_SeqAIJ      *amat = (Mat_SeqAIJ *)aij->A->data, *bmat = (Mat_SeqAIJ *)aij->B->data;
1819   PetscInt         i, j, cstart = mat->cmap->rstart;
1820   PetscReal        sum = 0.0;
1821   const MatScalar *v, *amata, *bmata;
1822   PetscMPIInt      iN;
1823 
1824   PetscFunctionBegin;
1825   if (aij->size == 1) {
1826     PetscCall(MatNorm(aij->A, type, norm));
1827   } else {
1828     PetscCall(MatSeqAIJGetArrayRead(aij->A, &amata));
1829     PetscCall(MatSeqAIJGetArrayRead(aij->B, &bmata));
1830     if (type == NORM_FROBENIUS) {
1831       v = amata;
1832       for (i = 0; i < amat->nz; i++) {
1833         sum += PetscRealPart(PetscConj(*v) * (*v));
1834         v++;
1835       }
1836       v = bmata;
1837       for (i = 0; i < bmat->nz; i++) {
1838         sum += PetscRealPart(PetscConj(*v) * (*v));
1839         v++;
1840       }
1841       PetscCallMPI(MPIU_Allreduce(&sum, norm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1842       *norm = PetscSqrtReal(*norm);
1843       PetscCall(PetscLogFlops(2.0 * amat->nz + 2.0 * bmat->nz));
1844     } else if (type == NORM_1) { /* max column norm */
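      /* accumulate this rank's column sums in tmp[], sum them across ranks, then take the largest column sum */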
1845       PetscReal *tmp, *tmp2;
1846       PetscInt  *jj, *garray = aij->garray;
1847       PetscCall(PetscCalloc1(mat->cmap->N + 1, &tmp));
1848       PetscCall(PetscMalloc1(mat->cmap->N + 1, &tmp2));
1849       *norm = 0.0;
1850       v     = amata;
1851       jj    = amat->j;
1852       for (j = 0; j < amat->nz; j++) {
1853         tmp[cstart + *jj++] += PetscAbsScalar(*v);
1854         v++;
1855       }
1856       v  = bmata;
1857       jj = bmat->j;
1858       for (j = 0; j < bmat->nz; j++) {
1859         tmp[garray[*jj++]] += PetscAbsScalar(*v);
1860         v++;
1861       }
1862       PetscCall(PetscMPIIntCast(mat->cmap->N, &iN));
1863       PetscCallMPI(MPIU_Allreduce(tmp, tmp2, iN, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1864       for (j = 0; j < mat->cmap->N; j++) {
1865         if (tmp2[j] > *norm) *norm = tmp2[j];
1866       }
1867       PetscCall(PetscFree(tmp));
1868       PetscCall(PetscFree(tmp2));
1869       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1870     } else if (type == NORM_INFINITY) { /* max row norm */
1871       PetscReal ntemp = 0.0;
1872       for (j = 0; j < aij->A->rmap->n; j++) {
1873         v   = PetscSafePointerPlusOffset(amata, amat->i[j]);
1874         sum = 0.0;
1875         for (i = 0; i < amat->i[j + 1] - amat->i[j]; i++) {
1876           sum += PetscAbsScalar(*v);
1877           v++;
1878         }
1879         v = PetscSafePointerPlusOffset(bmata, bmat->i[j]);
1880         for (i = 0; i < bmat->i[j + 1] - bmat->i[j]; i++) {
1881           sum += PetscAbsScalar(*v);
1882           v++;
1883         }
1884         if (sum > ntemp) ntemp = sum;
1885       }
1886       PetscCallMPI(MPIU_Allreduce(&ntemp, norm, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)mat)));
1887       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1888     } else SETERRQ(PetscObjectComm((PetscObject)mat), PETSC_ERR_SUP, "No support for two norm");
1889     PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &amata));
1890     PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &bmata));
1891   }
1892   PetscFunctionReturn(PETSC_SUCCESS);
1893 }
1894 
1895 static PetscErrorCode MatTranspose_MPIAIJ(Mat A, MatReuse reuse, Mat *matout)
1896 {
1897   Mat_MPIAIJ      *a    = (Mat_MPIAIJ *)A->data, *b;
1898   Mat_SeqAIJ      *Aloc = (Mat_SeqAIJ *)a->A->data, *Bloc = (Mat_SeqAIJ *)a->B->data, *sub_B_diag;
1899   PetscInt         M = A->rmap->N, N = A->cmap->N, ma, na, mb, nb, row, *cols, *cols_tmp, *B_diag_ilen, i, ncol, A_diag_ncol;
1900   const PetscInt  *ai, *aj, *bi, *bj, *B_diag_i;
1901   Mat              B, A_diag, *B_diag;
1902   const MatScalar *pbv, *bv;
1903 
1904   PetscFunctionBegin;
1905   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *matout));
1906   ma = A->rmap->n;
1907   na = A->cmap->n;
1908   mb = a->B->rmap->n;
1909   nb = a->B->cmap->n;
1910   ai = Aloc->i;
1911   aj = Aloc->j;
1912   bi = Bloc->i;
1913   bj = Bloc->j;
1914   if (reuse == MAT_INITIAL_MATRIX || *matout == A) {
1915     PetscInt            *d_nnz, *g_nnz, *o_nnz;
1916     PetscSFNode         *oloc;
1917     PETSC_UNUSED PetscSF sf;
1918 
1919     PetscCall(PetscMalloc4(na, &d_nnz, na, &o_nnz, nb, &g_nnz, nb, &oloc));
1920     /* compute d_nnz for preallocation */
1921     PetscCall(PetscArrayzero(d_nnz, na));
1922     for (i = 0; i < ai[ma]; i++) d_nnz[aj[i]]++;
1923     /* compute local off-diagonal contributions */
1924     PetscCall(PetscArrayzero(g_nnz, nb));
1925     for (i = 0; i < bi[ma]; i++) g_nnz[bj[i]]++;
1926     /* map those to global */
1927     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1928     PetscCall(PetscSFSetGraphLayout(sf, A->cmap, nb, NULL, PETSC_USE_POINTER, a->garray));
1929     PetscCall(PetscSFSetFromOptions(sf));
1930     PetscCall(PetscArrayzero(o_nnz, na));
1931     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1932     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1933     PetscCall(PetscSFDestroy(&sf));
1934 
1935     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
1936     PetscCall(MatSetSizes(B, A->cmap->n, A->rmap->n, N, M));
1937     PetscCall(MatSetBlockSizes(B, PetscAbs(A->cmap->bs), PetscAbs(A->rmap->bs)));
1938     PetscCall(MatSetType(B, ((PetscObject)A)->type_name));
1939     PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
1940     PetscCall(PetscFree4(d_nnz, o_nnz, g_nnz, oloc));
1941   } else {
1942     B = *matout;
1943     PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
1944   }
1945 
1946   b           = (Mat_MPIAIJ *)B->data;
1947   A_diag      = a->A;
1948   B_diag      = &b->A;
1949   sub_B_diag  = (Mat_SeqAIJ *)(*B_diag)->data;
1950   A_diag_ncol = A_diag->cmap->N;
1951   B_diag_ilen = sub_B_diag->ilen;
1952   B_diag_i    = sub_B_diag->i;
1953 
1954   /* Set ilen for diagonal of B */
1955   for (i = 0; i < A_diag_ncol; i++) B_diag_ilen[i] = B_diag_i[i + 1] - B_diag_i[i];
1956 
1957   /* Transpose the diagonal part of the matrix. In contrast to the off-diagonal part, this can be done
1958   very quickly (i.e., without using MatSetValues()), because all writes are local. */
1959   PetscCall(MatTransposeSetPrecursor(A_diag, *B_diag));
1960   PetscCall(MatTranspose(A_diag, MAT_REUSE_MATRIX, B_diag));
1961 
1962   /* copy over the B part */
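  /* each off-diagonal entry a(row, garray[bj[k]]) becomes entry (garray[bj[k]], row) of the transpose,
     so every original local row is inserted as one global column of B */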
1963   PetscCall(PetscMalloc1(bi[mb], &cols));
1964   PetscCall(MatSeqAIJGetArrayRead(a->B, &bv));
1965   pbv = bv;
1966   row = A->rmap->rstart;
1967   for (i = 0; i < bi[mb]; i++) cols[i] = a->garray[bj[i]];
1968   cols_tmp = cols;
1969   for (i = 0; i < mb; i++) {
1970     ncol = bi[i + 1] - bi[i];
1971     PetscCall(MatSetValues(B, ncol, cols_tmp, 1, &row, pbv, INSERT_VALUES));
1972     row++;
1973     if (pbv) pbv += ncol;
1974     if (cols_tmp) cols_tmp += ncol;
1975   }
1976   PetscCall(PetscFree(cols));
1977   PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bv));
1978 
1979   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1980   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1981   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1982     *matout = B;
1983   } else {
1984     PetscCall(MatHeaderMerge(A, &B));
1985   }
1986   PetscFunctionReturn(PETSC_SUCCESS);
1987 }
1988 
1989 static PetscErrorCode MatDiagonalScale_MPIAIJ(Mat mat, Vec ll, Vec rr)
1990 {
1991   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1992   Mat         a = aij->A, b = aij->B;
1993   PetscInt    s1, s2, s3;
1994 
1995   PetscFunctionBegin;
1996   PetscCall(MatGetLocalSize(mat, &s2, &s3));
1997   if (rr) {
1998     PetscCall(VecGetLocalSize(rr, &s1));
1999     PetscCheck(s1 == s3, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "right vector non-conforming local size");
2000     /* Overlap communication with computation. */
2001     PetscCall(VecScatterBegin(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
2002   }
2003   if (ll) {
2004     PetscCall(VecGetLocalSize(ll, &s1));
2005     PetscCheck(s1 == s2, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "left vector non-conforming local size");
2006     PetscUseTypeMethod(b, diagonalscale, ll, NULL);
2007   }
2008   /* scale the diagonal block */
2009   PetscUseTypeMethod(a, diagonalscale, ll, rr);
2010 
2011   if (rr) {
2012     /* Do a scatter end and then right scale the off-diagonal block */
2013     PetscCall(VecScatterEnd(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
2014     PetscUseTypeMethod(b, diagonalscale, NULL, aij->lvec);
2015   }
2016   PetscFunctionReturn(PETSC_SUCCESS);
2017 }
2018 
2019 static PetscErrorCode MatSetUnfactored_MPIAIJ(Mat A)
2020 {
2021   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2022 
2023   PetscFunctionBegin;
2024   PetscCall(MatSetUnfactored(a->A));
2025   PetscFunctionReturn(PETSC_SUCCESS);
2026 }
2027 
2028 static PetscErrorCode MatEqual_MPIAIJ(Mat A, Mat B, PetscBool *flag)
2029 {
2030   Mat_MPIAIJ *matB = (Mat_MPIAIJ *)B->data, *matA = (Mat_MPIAIJ *)A->data;
2031   Mat         a, b, c, d;
2032   PetscBool   flg;
2033 
2034   PetscFunctionBegin;
2035   a = matA->A;
2036   b = matA->B;
2037   c = matB->A;
2038   d = matB->B;
2039 
2040   PetscCall(MatEqual(a, c, &flg));
2041   if (flg) PetscCall(MatEqual(b, d, &flg));
2042   PetscCallMPI(MPIU_Allreduce(&flg, flag, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
2043   PetscFunctionReturn(PETSC_SUCCESS);
2044 }
2045 
2046 static PetscErrorCode MatCopy_MPIAIJ(Mat A, Mat B, MatStructure str)
2047 {
2048   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2049   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2050 
2051   PetscFunctionBegin;
2052   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
2053   if ((str != SAME_NONZERO_PATTERN) || (A->ops->copy != B->ops->copy)) {
2054     /* because of the column compression in the off-processor part of the matrix a->B,
2055        the number of columns in a->B and b->B may be different, hence we cannot call
2056        the MatCopy() directly on the two parts. If need be, we can provide a more
2057        efficient copy than the MatCopy_Basic() by first uncompressing the a->B matrices
2058        then copying the submatrices */
2059     PetscCall(MatCopy_Basic(A, B, str));
2060   } else {
2061     PetscCall(MatCopy(a->A, b->A, str));
2062     PetscCall(MatCopy(a->B, b->B, str));
2063   }
2064   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2065   PetscFunctionReturn(PETSC_SUCCESS);
2066 }
2067 
2068 /*
2069    Computes the number of nonzeros per row needed for preallocation when X and Y
2070    have different nonzero structure.
2071 */
2072 PetscErrorCode MatAXPYGetPreallocation_MPIX_private(PetscInt m, const PetscInt *xi, const PetscInt *xj, const PetscInt *xltog, const PetscInt *yi, const PetscInt *yj, const PetscInt *yltog, PetscInt *nnz)
2073 {
2074   PetscInt i, j, k, nzx, nzy;
2075 
2076   PetscFunctionBegin;
2077   /* Set the number of nonzeros in the new matrix */
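  /* nnz[i] is the size of the union of the column sets of row i of X and Y, counted by a two-pointer
     merge over the two sorted global column lists */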
2078   for (i = 0; i < m; i++) {
2079     const PetscInt *xjj = PetscSafePointerPlusOffset(xj, xi[i]), *yjj = PetscSafePointerPlusOffset(yj, yi[i]);
2080     nzx    = xi[i + 1] - xi[i];
2081     nzy    = yi[i + 1] - yi[i];
2082     nnz[i] = 0;
2083     for (j = 0, k = 0; j < nzx; j++) {                                /* Point in X */
2084       for (; k < nzy && yltog[yjj[k]] < xltog[xjj[j]]; k++) nnz[i]++; /* Catch up to X */
2085       if (k < nzy && yltog[yjj[k]] == xltog[xjj[j]]) k++;             /* Skip duplicate */
2086       nnz[i]++;
2087     }
2088     for (; k < nzy; k++) nnz[i]++;
2089   }
2090   PetscFunctionReturn(PETSC_SUCCESS);
2091 }
2092 
2093 /* This is the same as MatAXPYGetPreallocation_SeqAIJ, except that the local-to-global map is provided */
2094 static PetscErrorCode MatAXPYGetPreallocation_MPIAIJ(Mat Y, const PetscInt *yltog, Mat X, const PetscInt *xltog, PetscInt *nnz)
2095 {
2096   PetscInt    m = Y->rmap->N;
2097   Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data;
2098   Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
2099 
2100   PetscFunctionBegin;
2101   PetscCall(MatAXPYGetPreallocation_MPIX_private(m, x->i, x->j, xltog, y->i, y->j, yltog, nnz));
2102   PetscFunctionReturn(PETSC_SUCCESS);
2103 }
2104 
2105 static PetscErrorCode MatAXPY_MPIAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2106 {
2107   Mat_MPIAIJ *xx = (Mat_MPIAIJ *)X->data, *yy = (Mat_MPIAIJ *)Y->data;
2108 
2109   PetscFunctionBegin;
2110   if (str == SAME_NONZERO_PATTERN) {
2111     PetscCall(MatAXPY(yy->A, a, xx->A, str));
2112     PetscCall(MatAXPY(yy->B, a, xx->B, str));
2113   } else if (str == SUBSET_NONZERO_PATTERN) { /* the nonzero pattern of X is a subset of Y's */
2114     PetscCall(MatAXPY_Basic(Y, a, X, str));
2115   } else {
2116     Mat       B;
2117     PetscInt *nnz_d, *nnz_o;
2118 
2119     PetscCall(PetscMalloc1(yy->A->rmap->N, &nnz_d));
2120     PetscCall(PetscMalloc1(yy->B->rmap->N, &nnz_o));
2121     PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
2122     PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
2123     PetscCall(MatSetLayouts(B, Y->rmap, Y->cmap));
2124     PetscCall(MatSetType(B, ((PetscObject)Y)->type_name));
2125     PetscCall(MatAXPYGetPreallocation_SeqAIJ(yy->A, xx->A, nnz_d));
2126     PetscCall(MatAXPYGetPreallocation_MPIAIJ(yy->B, yy->garray, xx->B, xx->garray, nnz_o));
2127     PetscCall(MatMPIAIJSetPreallocation(B, 0, nnz_d, 0, nnz_o));
2128     PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
2129     PetscCall(MatHeaderMerge(Y, &B));
2130     PetscCall(PetscFree(nnz_d));
2131     PetscCall(PetscFree(nnz_o));
2132   }
2133   PetscFunctionReturn(PETSC_SUCCESS);
2134 }
2135 
2136 PETSC_INTERN PetscErrorCode MatConjugate_SeqAIJ(Mat);
2137 
2138 static PetscErrorCode MatConjugate_MPIAIJ(Mat mat)
2139 {
2140   PetscFunctionBegin;
2141   if (PetscDefined(USE_COMPLEX)) {
2142     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2143 
2144     PetscCall(MatConjugate_SeqAIJ(aij->A));
2145     PetscCall(MatConjugate_SeqAIJ(aij->B));
2146   }
2147   PetscFunctionReturn(PETSC_SUCCESS);
2148 }
2149 
2150 static PetscErrorCode MatRealPart_MPIAIJ(Mat A)
2151 {
2152   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2153 
2154   PetscFunctionBegin;
2155   PetscCall(MatRealPart(a->A));
2156   PetscCall(MatRealPart(a->B));
2157   PetscFunctionReturn(PETSC_SUCCESS);
2158 }
2159 
2160 static PetscErrorCode MatImaginaryPart_MPIAIJ(Mat A)
2161 {
2162   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2163 
2164   PetscFunctionBegin;
2165   PetscCall(MatImaginaryPart(a->A));
2166   PetscCall(MatImaginaryPart(a->B));
2167   PetscFunctionReturn(PETSC_SUCCESS);
2168 }
2169 
2170 static PetscErrorCode MatGetRowMaxAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2171 {
2172   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
2173   PetscInt           i, *idxb = NULL, m = A->rmap->n;
2174   PetscScalar       *va, *vv;
2175   Vec                vB, vA;
2176   const PetscScalar *vb;
2177 
2178   PetscFunctionBegin;
2179   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2180   PetscCall(MatGetRowMaxAbs(a->A, vA, idx));
2181 
2182   PetscCall(VecGetArrayWrite(vA, &va));
2183   if (idx) {
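    /* shift the diagonal-block column indices from local to global numbering */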
2184     for (i = 0; i < m; i++) {
2185       if (PetscAbsScalar(va[i])) idx[i] += A->cmap->rstart;
2186     }
2187   }
2188 
2189   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2190   PetscCall(PetscMalloc1(m, &idxb));
2191   PetscCall(MatGetRowMaxAbs(a->B, vB, idxb));
2192 
2193   PetscCall(VecGetArrayWrite(v, &vv));
2194   PetscCall(VecGetArrayRead(vB, &vb));
2195   for (i = 0; i < m; i++) {
2196     if (PetscAbsScalar(va[i]) < PetscAbsScalar(vb[i])) {
2197       vv[i] = vb[i];
2198       if (idx) idx[i] = a->garray[idxb[i]];
2199     } else {
2200       vv[i] = va[i];
2201       if (idx && PetscAbsScalar(va[i]) == PetscAbsScalar(vb[i]) && idxb[i] != -1 && idx[i] > a->garray[idxb[i]]) idx[i] = a->garray[idxb[i]];
2202     }
2203   }
2204   PetscCall(VecRestoreArrayWrite(v, &vv));
2205   PetscCall(VecRestoreArrayWrite(vA, &va));
2206   PetscCall(VecRestoreArrayRead(vB, &vb));
2207   PetscCall(PetscFree(idxb));
2208   PetscCall(VecDestroy(&vA));
2209   PetscCall(VecDestroy(&vB));
2210   PetscFunctionReturn(PETSC_SUCCESS);
2211 }
2212 
2213 static PetscErrorCode MatGetRowSumAbs_MPIAIJ(Mat A, Vec v)
2214 {
2215   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2216   Vec         vB, vA;
2217 
2218   PetscFunctionBegin;
2219   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2220   PetscCall(MatGetRowSumAbs(a->A, vA));
2221   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2222   PetscCall(MatGetRowSumAbs(a->B, vB));
2223   PetscCall(VecAXPY(vA, 1.0, vB));
2224   PetscCall(VecDestroy(&vB));
2225   PetscCall(VecCopy(vA, v));
2226   PetscCall(VecDestroy(&vA));
2227   PetscFunctionReturn(PETSC_SUCCESS);
2228 }
2229 
2230 static PetscErrorCode MatGetRowMinAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2231 {
2232   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2233   PetscInt           m = A->rmap->n, n = A->cmap->n;
2234   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2235   PetscInt          *cmap = mat->garray;
2236   PetscInt          *diagIdx, *offdiagIdx;
2237   Vec                diagV, offdiagV;
2238   PetscScalar       *a, *diagA, *offdiagA;
2239   const PetscScalar *ba, *bav;
2240   PetscInt           r, j, col, ncols, *bi, *bj;
2241   Mat                B = mat->B;
2242   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2243 
2244   PetscFunctionBegin;
2245   /* When one process holds the entire matrix and the other processes have no entries */
2246   if (A->cmap->N == n) {
2247     PetscCall(VecGetArrayWrite(v, &diagA));
2248     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2249     PetscCall(MatGetRowMinAbs(mat->A, diagV, idx));
2250     PetscCall(VecDestroy(&diagV));
2251     PetscCall(VecRestoreArrayWrite(v, &diagA));
2252     PetscFunctionReturn(PETSC_SUCCESS);
2253   } else if (n == 0) {
2254     if (m) {
2255       PetscCall(VecGetArrayWrite(v, &a));
2256       for (r = 0; r < m; r++) {
2257         a[r] = 0.0;
2258         if (idx) idx[r] = -1;
2259       }
2260       PetscCall(VecRestoreArrayWrite(v, &a));
2261     }
2262     PetscFunctionReturn(PETSC_SUCCESS);
2263   }
2264 
2265   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2266   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2267   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2268   PetscCall(MatGetRowMinAbs(mat->A, diagV, diagIdx));
2269 
2270   /* Get offdiagIdx[] for implicit 0.0 */
2271   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2272   ba = bav;
2273   bi = b->i;
2274   bj = b->j;
2275   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2276   for (r = 0; r < m; r++) {
2277     ncols = bi[r + 1] - bi[r];
2278     if (ncols == A->cmap->N - n) { /* Brow is dense */
2279       offdiagA[r]   = *ba;
2280       offdiagIdx[r] = cmap[0];
2281     } else { /* Brow is sparse, so the implicit 0.0 is already the smallest possible magnitude */
2282       offdiagA[r] = 0.0;
2283 
2284       /* Find first hole in the cmap */
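      /* while the compressed columns are contiguous, col == j holds left of the diagonal block and
         col == j + n holds right of it; the first mismatch is the global column of the first implicit zero */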
2285       for (j = 0; j < ncols; j++) {
2286         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2287         if (col > j && j < cstart) {
2288           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2289           break;
2290         } else if (col > j + n && j >= cstart) {
2291           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2292           break;
2293         }
2294       }
2295       if (j == ncols && ncols < A->cmap->N - n) {
2296         /* a hole is outside compressed Bcols */
2297         if (ncols == 0) {
2298           if (cstart) {
2299             offdiagIdx[r] = 0;
2300           } else offdiagIdx[r] = cend;
2301         } else { /* ncols > 0 */
2302           offdiagIdx[r] = cmap[ncols - 1] + 1;
2303           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2304         }
2305       }
2306     }
2307 
2308     for (j = 0; j < ncols; j++) {
2309       if (PetscAbsScalar(offdiagA[r]) > PetscAbsScalar(*ba)) {
2310         offdiagA[r]   = *ba;
2311         offdiagIdx[r] = cmap[*bj];
2312       }
2313       ba++;
2314       bj++;
2315     }
2316   }
2317 
2318   PetscCall(VecGetArrayWrite(v, &a));
2319   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2320   for (r = 0; r < m; ++r) {
2321     if (PetscAbsScalar(diagA[r]) < PetscAbsScalar(offdiagA[r])) {
2322       a[r] = diagA[r];
2323       if (idx) idx[r] = cstart + diagIdx[r];
2324     } else if (PetscAbsScalar(diagA[r]) == PetscAbsScalar(offdiagA[r])) {
2325       a[r] = diagA[r];
2326       if (idx) {
2327         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2328           idx[r] = cstart + diagIdx[r];
2329         } else idx[r] = offdiagIdx[r];
2330       }
2331     } else {
2332       a[r] = offdiagA[r];
2333       if (idx) idx[r] = offdiagIdx[r];
2334     }
2335   }
2336   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2337   PetscCall(VecRestoreArrayWrite(v, &a));
2338   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2339   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2340   PetscCall(VecDestroy(&diagV));
2341   PetscCall(VecDestroy(&offdiagV));
2342   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2343   PetscFunctionReturn(PETSC_SUCCESS);
2344 }
2345 
2346 static PetscErrorCode MatGetRowMin_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2347 {
2348   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2349   PetscInt           m = A->rmap->n, n = A->cmap->n;
2350   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2351   PetscInt          *cmap = mat->garray;
2352   PetscInt          *diagIdx, *offdiagIdx;
2353   Vec                diagV, offdiagV;
2354   PetscScalar       *a, *diagA, *offdiagA;
2355   const PetscScalar *ba, *bav;
2356   PetscInt           r, j, col, ncols, *bi, *bj;
2357   Mat                B = mat->B;
2358   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2359 
2360   PetscFunctionBegin;
2361   /* When one process holds the entire matrix and the other processes have no entries */
2362   if (A->cmap->N == n) {
2363     PetscCall(VecGetArrayWrite(v, &diagA));
2364     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2365     PetscCall(MatGetRowMin(mat->A, diagV, idx));
2366     PetscCall(VecDestroy(&diagV));
2367     PetscCall(VecRestoreArrayWrite(v, &diagA));
2368     PetscFunctionReturn(PETSC_SUCCESS);
2369   } else if (n == 0) {
2370     if (m) {
2371       PetscCall(VecGetArrayWrite(v, &a));
2372       for (r = 0; r < m; r++) {
2373         a[r] = PETSC_MAX_REAL;
2374         if (idx) idx[r] = -1;
2375       }
2376       PetscCall(VecRestoreArrayWrite(v, &a));
2377     }
2378     PetscFunctionReturn(PETSC_SUCCESS);
2379   }
2380 
2381   PetscCall(PetscCalloc2(m, &diagIdx, m, &offdiagIdx));
2382   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2383   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2384   PetscCall(MatGetRowMin(mat->A, diagV, diagIdx));
2385 
2386   /* Get offdiagIdx[] for implicit 0.0 */
2387   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2388   ba = bav;
2389   bi = b->i;
2390   bj = b->j;
2391   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2392   for (r = 0; r < m; r++) {
2393     ncols = bi[r + 1] - bi[r];
2394     if (ncols == A->cmap->N - n) { /* Brow is dense */
2395       offdiagA[r]   = *ba;
2396       offdiagIdx[r] = cmap[0];
2397     } else { /* Brow is sparse, so the implicit 0.0 means the row minimum is 0.0 or lower */
2398       offdiagA[r] = 0.0;
2399 
2400       /* Find first hole in the cmap */
2401       for (j = 0; j < ncols; j++) {
2402         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2403         if (col > j && j < cstart) {
2404           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2405           break;
2406         } else if (col > j + n && j >= cstart) {
2407           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2408           break;
2409         }
2410       }
2411       if (j == ncols && ncols < A->cmap->N - n) {
2412         /* a hole is outside compressed Bcols */
2413         if (ncols == 0) {
2414           if (cstart) {
2415             offdiagIdx[r] = 0;
2416           } else offdiagIdx[r] = cend;
2417         } else { /* ncols > 0 */
2418           offdiagIdx[r] = cmap[ncols - 1] + 1;
2419           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2420         }
2421       }
2422     }
2423 
2424     for (j = 0; j < ncols; j++) {
2425       if (PetscRealPart(offdiagA[r]) > PetscRealPart(*ba)) {
2426         offdiagA[r]   = *ba;
2427         offdiagIdx[r] = cmap[*bj];
2428       }
2429       ba++;
2430       bj++;
2431     }
2432   }
2433 
2434   PetscCall(VecGetArrayWrite(v, &a));
2435   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2436   for (r = 0; r < m; ++r) {
2437     if (PetscRealPart(diagA[r]) < PetscRealPart(offdiagA[r])) {
2438       a[r] = diagA[r];
2439       if (idx) idx[r] = cstart + diagIdx[r];
2440     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2441       a[r] = diagA[r];
2442       if (idx) {
2443         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2444           idx[r] = cstart + diagIdx[r];
2445         } else idx[r] = offdiagIdx[r];
2446       }
2447     } else {
2448       a[r] = offdiagA[r];
2449       if (idx) idx[r] = offdiagIdx[r];
2450     }
2451   }
2452   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2453   PetscCall(VecRestoreArrayWrite(v, &a));
2454   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2455   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2456   PetscCall(VecDestroy(&diagV));
2457   PetscCall(VecDestroy(&offdiagV));
2458   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2459   PetscFunctionReturn(PETSC_SUCCESS);
2460 }
2461 
2462 static PetscErrorCode MatGetRowMax_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2463 {
2464   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2465   PetscInt           m = A->rmap->n, n = A->cmap->n;
2466   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2467   PetscInt          *cmap = mat->garray;
2468   PetscInt          *diagIdx, *offdiagIdx;
2469   Vec                diagV, offdiagV;
2470   PetscScalar       *a, *diagA, *offdiagA;
2471   const PetscScalar *ba, *bav;
2472   PetscInt           r, j, col, ncols, *bi, *bj;
2473   Mat                B = mat->B;
2474   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2475 
2476   PetscFunctionBegin;
2477   /* When one process holds the entire matrix and the other processes have no entries */
2478   if (A->cmap->N == n) {
2479     PetscCall(VecGetArrayWrite(v, &diagA));
2480     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2481     PetscCall(MatGetRowMax(mat->A, diagV, idx));
2482     PetscCall(VecDestroy(&diagV));
2483     PetscCall(VecRestoreArrayWrite(v, &diagA));
2484     PetscFunctionReturn(PETSC_SUCCESS);
2485   } else if (n == 0) {
2486     if (m) {
2487       PetscCall(VecGetArrayWrite(v, &a));
2488       for (r = 0; r < m; r++) {
2489         a[r] = PETSC_MIN_REAL;
2490         if (idx) idx[r] = -1;
2491       }
2492       PetscCall(VecRestoreArrayWrite(v, &a));
2493     }
2494     PetscFunctionReturn(PETSC_SUCCESS);
2495   }
2496 
2497   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2498   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2499   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2500   PetscCall(MatGetRowMax(mat->A, diagV, diagIdx));
2501 
2502   /* Get offdiagIdx[] for implicit 0.0 */
2503   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2504   ba = bav;
2505   bi = b->i;
2506   bj = b->j;
2507   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2508   for (r = 0; r < m; r++) {
2509     ncols = bi[r + 1] - bi[r];
2510     if (ncols == A->cmap->N - n) { /* Brow is dense */
2511       offdiagA[r]   = *ba;
2512       offdiagIdx[r] = cmap[0];
2513     } else { /* Brow is sparse, so the implicit 0.0 means the row maximum is 0.0 or higher */
2514       offdiagA[r] = 0.0;
2515 
2516       /* Find first hole in the cmap */
2517       for (j = 0; j < ncols; j++) {
2518         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2519         if (col > j && j < cstart) {
2520           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2521           break;
2522         } else if (col > j + n && j >= cstart) {
2523           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2524           break;
2525         }
2526       }
2527       if (j == ncols && ncols < A->cmap->N - n) {
2528         /* a hole is outside compressed Bcols */
2529         if (ncols == 0) {
2530           if (cstart) {
2531             offdiagIdx[r] = 0;
2532           } else offdiagIdx[r] = cend;
2533         } else { /* ncols > 0 */
2534           offdiagIdx[r] = cmap[ncols - 1] + 1;
2535           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2536         }
2537       }
2538     }
2539 
2540     for (j = 0; j < ncols; j++) {
2541       if (PetscRealPart(offdiagA[r]) < PetscRealPart(*ba)) {
2542         offdiagA[r]   = *ba;
2543         offdiagIdx[r] = cmap[*bj];
2544       }
2545       ba++;
2546       bj++;
2547     }
2548   }
2549 
2550   PetscCall(VecGetArrayWrite(v, &a));
2551   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2552   for (r = 0; r < m; ++r) {
2553     if (PetscRealPart(diagA[r]) > PetscRealPart(offdiagA[r])) {
2554       a[r] = diagA[r];
2555       if (idx) idx[r] = cstart + diagIdx[r];
2556     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2557       a[r] = diagA[r];
2558       if (idx) {
2559         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2560           idx[r] = cstart + diagIdx[r];
2561         } else idx[r] = offdiagIdx[r];
2562       }
2563     } else {
2564       a[r] = offdiagA[r];
2565       if (idx) idx[r] = offdiagIdx[r];
2566     }
2567   }
2568   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2569   PetscCall(VecRestoreArrayWrite(v, &a));
2570   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2571   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2572   PetscCall(VecDestroy(&diagV));
2573   PetscCall(VecDestroy(&offdiagV));
2574   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2575   PetscFunctionReturn(PETSC_SUCCESS);
2576 }
2577 
2578 PetscErrorCode MatGetSeqNonzeroStructure_MPIAIJ(Mat mat, Mat *newmat)
2579 {
2580   Mat *dummy;
2581 
2582   PetscFunctionBegin;
2583   PetscCall(MatCreateSubMatrix_MPIAIJ_All(mat, MAT_DO_NOT_GET_VALUES, MAT_INITIAL_MATRIX, &dummy));
2584   *newmat = *dummy;
2585   PetscCall(PetscFree(dummy));
2586   PetscFunctionReturn(PETSC_SUCCESS);
2587 }
2588 
2589 static PetscErrorCode MatInvertBlockDiagonal_MPIAIJ(Mat A, const PetscScalar **values)
2590 {
2591   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2592 
2593   PetscFunctionBegin;
2594   PetscCall(MatInvertBlockDiagonal(a->A, values));
2595   A->factorerrortype = a->A->factorerrortype;
2596   PetscFunctionReturn(PETSC_SUCCESS);
2597 }
2598 
2599 static PetscErrorCode MatSetRandom_MPIAIJ(Mat x, PetscRandom rctx)
2600 {
2601   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)x->data;
2602 
2603   PetscFunctionBegin;
2604   PetscCheck(x->assembled || x->preallocated, PetscObjectComm((PetscObject)x), PETSC_ERR_ARG_WRONGSTATE, "MatSetRandom on an unassembled and unpreallocated MATMPIAIJ is not allowed");
2605   PetscCall(MatSetRandom(aij->A, rctx));
2606   if (x->assembled) {
2607     PetscCall(MatSetRandom(aij->B, rctx));
2608   } else {
2609     PetscCall(MatSetRandomSkipColumnRange_SeqAIJ_Private(aij->B, x->cmap->rstart, x->cmap->rend, rctx));
2610   }
2611   PetscCall(MatAssemblyBegin(x, MAT_FINAL_ASSEMBLY));
2612   PetscCall(MatAssemblyEnd(x, MAT_FINAL_ASSEMBLY));
2613   PetscFunctionReturn(PETSC_SUCCESS);
2614 }
2615 
2616 static PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ(Mat A, PetscBool sc)
2617 {
2618   PetscFunctionBegin;
2619   if (sc) A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ_Scalable;
2620   else A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ;
2621   PetscFunctionReturn(PETSC_SUCCESS);
2622 }
2623 
2624 /*@
2625   MatMPIAIJGetNumberNonzeros - gets the number of nonzeros in the matrix on this MPI rank
2626 
2627   Not Collective
2628 
2629   Input Parameter:
2630 . A - the matrix
2631 
2632   Output Parameter:
2633 . nz - the number of nonzeros
2634 
2635   Level: advanced
2636 
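  Example Usage:
  A minimal sketch (assuming `A` is an assembled `MATMPIAIJ`):
.vb
  PetscCount nz;

  PetscCall(MatMPIAIJGetNumberNonzeros(A, &nz));
.ve
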
2637 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2638 @*/
2639 PetscErrorCode MatMPIAIJGetNumberNonzeros(Mat A, PetscCount *nz)
2640 {
2641   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)A->data;
2642   Mat_SeqAIJ *aaij = (Mat_SeqAIJ *)maij->A->data, *baij = (Mat_SeqAIJ *)maij->B->data;
2643   PetscBool   isaij;
2644 
2645   PetscFunctionBegin;
2646   PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIAIJ, &isaij));
2647   PetscCheck(isaij, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
2648   *nz = aaij->i[A->rmap->n] + baij->i[A->rmap->n];
2649   PetscFunctionReturn(PETSC_SUCCESS);
2650 }
2651 
2652 /*@
2653   MatMPIAIJSetUseScalableIncreaseOverlap - Determines whether the matrix uses a scalable algorithm to compute the overlap
2654 
2655   Collective
2656 
2657   Input Parameters:
2658 + A  - the matrix
2659 - sc - `PETSC_TRUE` indicates the scalable algorithm should be used (the default is not to use it)
2660 
2661   Level: advanced
2662 
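  Example Usage:
  A minimal sketch (assuming `A` is a `MATMPIAIJ`); the same switch is exposed as the command line option -mat_increase_overlap_scalable:
.vb
  PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, PETSC_TRUE));
  /* subsequent MatIncreaseOverlap() calls on A now use the scalable algorithm */
.ve
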
2663 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2664 @*/
2665 PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap(Mat A, PetscBool sc)
2666 {
2667   PetscFunctionBegin;
2668   PetscTryMethod(A, "MatMPIAIJSetUseScalableIncreaseOverlap_C", (Mat, PetscBool), (A, sc));
2669   PetscFunctionReturn(PETSC_SUCCESS);
2670 }
2671 
2672 PetscErrorCode MatSetFromOptions_MPIAIJ(Mat A, PetscOptionItems *PetscOptionsObject)
2673 {
2674   PetscBool sc = PETSC_FALSE, flg;
2675 
2676   PetscFunctionBegin;
2677   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJ options");
2678   if (A->ops->increaseoverlap == MatIncreaseOverlap_MPIAIJ_Scalable) sc = PETSC_TRUE;
2679   PetscCall(PetscOptionsBool("-mat_increase_overlap_scalable", "Use a scalable algorithm to compute the overlap", "MatIncreaseOverlap", sc, &sc, &flg));
2680   if (flg) PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, sc));
2681   PetscOptionsHeadEnd();
2682   PetscFunctionReturn(PETSC_SUCCESS);
2683 }
2684 
2685 static PetscErrorCode MatShift_MPIAIJ(Mat Y, PetscScalar a)
2686 {
2687   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)Y->data;
2688   Mat_SeqAIJ *aij  = (Mat_SeqAIJ *)maij->A->data;
2689 
2690   PetscFunctionBegin;
2691   if (!Y->preallocated) {
2692     PetscCall(MatMPIAIJSetPreallocation(Y, 1, NULL, 0, NULL));
2693   } else if (!aij->nz) { /* It does not matter if the diagonal of Y only partially lies in maij->A; we just need an estimated preallocation. */
2694     PetscInt nonew = aij->nonew;
2695     PetscCall(MatSeqAIJSetPreallocation(maij->A, 1, NULL));
2696     aij->nonew = nonew;
2697   }
2698   PetscCall(MatShift_Basic(Y, a));
2699   PetscFunctionReturn(PETSC_SUCCESS);
2700 }
2701 
2702 static PetscErrorCode MatMissingDiagonal_MPIAIJ(Mat A, PetscBool *missing, PetscInt *d)
2703 {
2704   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2705 
2706   PetscFunctionBegin;
2707   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only works for square matrices");
2708   PetscCall(MatMissingDiagonal(a->A, missing, d));
2709   if (d) {
2710     PetscInt rstart;
2711     PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
2712     *d += rstart;
2713   }
2714   PetscFunctionReturn(PETSC_SUCCESS);
2715 }
2716 
2717 static PetscErrorCode MatInvertVariableBlockDiagonal_MPIAIJ(Mat A, PetscInt nblocks, const PetscInt *bsizes, PetscScalar *diag)
2718 {
2719   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2720 
2721   PetscFunctionBegin;
2722   PetscCall(MatInvertVariableBlockDiagonal(a->A, nblocks, bsizes, diag));
2723   PetscFunctionReturn(PETSC_SUCCESS);
2724 }
2725 
2726 static PetscErrorCode MatEliminateZeros_MPIAIJ(Mat A, PetscBool keep)
2727 {
2728   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2729 
2730   PetscFunctionBegin;
2731   PetscCall(MatEliminateZeros_SeqAIJ(a->A, keep));        // possibly keep zero diagonal coefficients
2732   PetscCall(MatEliminateZeros_SeqAIJ(a->B, PETSC_FALSE)); // never keep zero diagonal coefficients
2733   PetscFunctionReturn(PETSC_SUCCESS);
2734 }
2735 
2736 static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
2737                                        MatGetRow_MPIAIJ,
2738                                        MatRestoreRow_MPIAIJ,
2739                                        MatMult_MPIAIJ,
2740                                        /* 4*/ MatMultAdd_MPIAIJ,
2741                                        MatMultTranspose_MPIAIJ,
2742                                        MatMultTransposeAdd_MPIAIJ,
2743                                        NULL,
2744                                        NULL,
2745                                        NULL,
2746                                        /*10*/ NULL,
2747                                        NULL,
2748                                        NULL,
2749                                        MatSOR_MPIAIJ,
2750                                        MatTranspose_MPIAIJ,
2751                                        /*15*/ MatGetInfo_MPIAIJ,
2752                                        MatEqual_MPIAIJ,
2753                                        MatGetDiagonal_MPIAIJ,
2754                                        MatDiagonalScale_MPIAIJ,
2755                                        MatNorm_MPIAIJ,
2756                                        /*20*/ MatAssemblyBegin_MPIAIJ,
2757                                        MatAssemblyEnd_MPIAIJ,
2758                                        MatSetOption_MPIAIJ,
2759                                        MatZeroEntries_MPIAIJ,
2760                                        /*24*/ MatZeroRows_MPIAIJ,
2761                                        NULL,
2762                                        NULL,
2763                                        NULL,
2764                                        NULL,
2765                                        /*29*/ MatSetUp_MPI_Hash,
2766                                        NULL,
2767                                        NULL,
2768                                        MatGetDiagonalBlock_MPIAIJ,
2769                                        NULL,
2770                                        /*34*/ MatDuplicate_MPIAIJ,
2771                                        NULL,
2772                                        NULL,
2773                                        NULL,
2774                                        NULL,
2775                                        /*39*/ MatAXPY_MPIAIJ,
2776                                        MatCreateSubMatrices_MPIAIJ,
2777                                        MatIncreaseOverlap_MPIAIJ,
2778                                        MatGetValues_MPIAIJ,
2779                                        MatCopy_MPIAIJ,
2780                                        /*44*/ MatGetRowMax_MPIAIJ,
2781                                        MatScale_MPIAIJ,
2782                                        MatShift_MPIAIJ,
2783                                        MatDiagonalSet_MPIAIJ,
2784                                        MatZeroRowsColumns_MPIAIJ,
2785                                        /*49*/ MatSetRandom_MPIAIJ,
2786                                        MatGetRowIJ_MPIAIJ,
2787                                        MatRestoreRowIJ_MPIAIJ,
2788                                        NULL,
2789                                        NULL,
2790                                        /*54*/ MatFDColoringCreate_MPIXAIJ,
2791                                        NULL,
2792                                        MatSetUnfactored_MPIAIJ,
2793                                        MatPermute_MPIAIJ,
2794                                        NULL,
2795                                        /*59*/ MatCreateSubMatrix_MPIAIJ,
2796                                        MatDestroy_MPIAIJ,
2797                                        MatView_MPIAIJ,
2798                                        NULL,
2799                                        NULL,
2800                                        /*64*/ NULL,
2801                                        MatMatMatMultNumeric_MPIAIJ_MPIAIJ_MPIAIJ,
2802                                        NULL,
2803                                        NULL,
2804                                        NULL,
2805                                        /*69*/ MatGetRowMaxAbs_MPIAIJ,
2806                                        MatGetRowMinAbs_MPIAIJ,
2807                                        NULL,
2808                                        NULL,
2809                                        NULL,
2810                                        NULL,
2811                                        /*75*/ MatFDColoringApply_AIJ,
2812                                        MatSetFromOptions_MPIAIJ,
2813                                        NULL,
2814                                        NULL,
2815                                        MatFindZeroDiagonals_MPIAIJ,
2816                                        /*80*/ NULL,
2817                                        NULL,
2818                                        NULL,
2819                                        /*83*/ MatLoad_MPIAIJ,
2820                                        NULL,
2821                                        NULL,
2822                                        NULL,
2823                                        NULL,
2824                                        NULL,
2825                                        /*89*/ NULL,
2826                                        NULL,
2827                                        MatMatMultNumeric_MPIAIJ_MPIAIJ,
2828                                        NULL,
2829                                        NULL,
2830                                        /*94*/ MatPtAPNumeric_MPIAIJ_MPIAIJ,
2831                                        NULL,
2832                                        NULL,
2833                                        NULL,
2834                                        MatBindToCPU_MPIAIJ,
2835                                        /*99*/ MatProductSetFromOptions_MPIAIJ,
2836                                        NULL,
2837                                        NULL,
2838                                        MatConjugate_MPIAIJ,
2839                                        NULL,
2840                                        /*104*/ MatSetValuesRow_MPIAIJ,
2841                                        MatRealPart_MPIAIJ,
2842                                        MatImaginaryPart_MPIAIJ,
2843                                        NULL,
2844                                        NULL,
2845                                        /*109*/ NULL,
2846                                        NULL,
2847                                        MatGetRowMin_MPIAIJ,
2848                                        NULL,
2849                                        MatMissingDiagonal_MPIAIJ,
2850                                        /*114*/ MatGetSeqNonzeroStructure_MPIAIJ,
2851                                        NULL,
2852                                        MatGetGhosts_MPIAIJ,
2853                                        NULL,
2854                                        NULL,
2855                                        /*119*/ MatMultDiagonalBlock_MPIAIJ,
2856                                        NULL,
2857                                        NULL,
2858                                        NULL,
2859                                        MatGetMultiProcBlock_MPIAIJ,
2860                                        /*124*/ MatFindNonzeroRows_MPIAIJ,
2861                                        MatGetColumnReductions_MPIAIJ,
2862                                        MatInvertBlockDiagonal_MPIAIJ,
2863                                        MatInvertVariableBlockDiagonal_MPIAIJ,
2864                                        MatCreateSubMatricesMPI_MPIAIJ,
2865                                        /*129*/ NULL,
2866                                        NULL,
2867                                        NULL,
2868                                        MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ,
2869                                        NULL,
2870                                        /*134*/ NULL,
2871                                        NULL,
2872                                        NULL,
2873                                        NULL,
2874                                        NULL,
2875                                        /*139*/ MatSetBlockSizes_MPIAIJ,
2876                                        NULL,
2877                                        NULL,
2878                                        MatFDColoringSetUp_MPIXAIJ,
2879                                        MatFindOffBlockDiagonalEntries_MPIAIJ,
2880                                        MatCreateMPIMatConcatenateSeqMat_MPIAIJ,
2881                                        /*145*/ NULL,
2882                                        NULL,
2883                                        NULL,
2884                                        MatCreateGraph_Simple_AIJ,
2885                                        NULL,
2886                                        /*150*/ NULL,
2887                                        MatEliminateZeros_MPIAIJ,
2888                                        MatGetRowSumAbs_MPIAIJ,
2889                                        NULL,
2890                                        NULL,
2891                                        /*155*/ NULL,
2892                                        MatCopyHashToXAIJ_MPI_Hash};
2893 
2894 static PetscErrorCode MatStoreValues_MPIAIJ(Mat mat)
2895 {
2896   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2897 
2898   PetscFunctionBegin;
2899   PetscCall(MatStoreValues(aij->A));
2900   PetscCall(MatStoreValues(aij->B));
2901   PetscFunctionReturn(PETSC_SUCCESS);
2902 }
2903 
2904 static PetscErrorCode MatRetrieveValues_MPIAIJ(Mat mat)
2905 {
2906   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2907 
2908   PetscFunctionBegin;
2909   PetscCall(MatRetrieveValues(aij->A));
2910   PetscCall(MatRetrieveValues(aij->B));
2911   PetscFunctionReturn(PETSC_SUCCESS);
2912 }
2913 
2914 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJ(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
2915 {
2916   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2917   PetscMPIInt size;
2918 
2919   PetscFunctionBegin;
2920   if (B->hash_active) {
2921     B->ops[0]      = b->cops;
2922     B->hash_active = PETSC_FALSE;
2923   }
2924   PetscCall(PetscLayoutSetUp(B->rmap));
2925   PetscCall(PetscLayoutSetUp(B->cmap));
2926 
2927 #if defined(PETSC_USE_CTABLE)
2928   PetscCall(PetscHMapIDestroy(&b->colmap));
2929 #else
2930   PetscCall(PetscFree(b->colmap));
2931 #endif
2932   PetscCall(PetscFree(b->garray));
2933   PetscCall(VecDestroy(&b->lvec));
2934   PetscCall(VecScatterDestroy(&b->Mvctx));
2935 
2936   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2937 
2938   MatSeqXAIJGetOptions_Private(b->B);
2939   PetscCall(MatDestroy(&b->B));
2940   PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
2941   PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
2942   PetscCall(MatSetBlockSizesFromMats(b->B, B, B));
2943   PetscCall(MatSetType(b->B, MATSEQAIJ));
2944   MatSeqXAIJRestoreOptions_Private(b->B);
2945 
2946   MatSeqXAIJGetOptions_Private(b->A);
2947   PetscCall(MatDestroy(&b->A));
2948   PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
2949   PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
2950   PetscCall(MatSetBlockSizesFromMats(b->A, B, B));
2951   PetscCall(MatSetType(b->A, MATSEQAIJ));
2952   MatSeqXAIJRestoreOptions_Private(b->A);
2953 
2954   PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
2955   PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
2956   B->preallocated  = PETSC_TRUE;
2957   B->was_assembled = PETSC_FALSE;
2958   B->assembled     = PETSC_FALSE;
2959   PetscFunctionReturn(PETSC_SUCCESS);
2960 }
2961 
2962 static PetscErrorCode MatResetPreallocation_MPIAIJ(Mat B)
2963 {
2964   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2965 
2966   PetscFunctionBegin;
2967   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
2968   PetscCall(PetscLayoutSetUp(B->rmap));
2969   PetscCall(PetscLayoutSetUp(B->cmap));
2970   if (B->assembled || B->was_assembled) PetscCall(MatDisAssemble_MPIAIJ(B, PETSC_TRUE));
2971   else {
2972 #if defined(PETSC_USE_CTABLE)
2973     PetscCall(PetscHMapIDestroy(&b->colmap));
2974 #else
2975     PetscCall(PetscFree(b->colmap));
2976 #endif
2977     PetscCall(PetscFree(b->garray));
2978     PetscCall(VecDestroy(&b->lvec));
2979   }
2980   PetscCall(VecScatterDestroy(&b->Mvctx));
2981 
2982   PetscCall(MatResetPreallocation(b->A));
2983   PetscCall(MatResetPreallocation(b->B));
2984   B->preallocated  = PETSC_TRUE;
2985   B->was_assembled = PETSC_FALSE;
2986   B->assembled     = PETSC_FALSE;
2987   PetscFunctionReturn(PETSC_SUCCESS);
2988 }
2989 
2990 PetscErrorCode MatDuplicate_MPIAIJ(Mat matin, MatDuplicateOption cpvalues, Mat *newmat)
2991 {
2992   Mat         mat;
2993   Mat_MPIAIJ *a, *oldmat = (Mat_MPIAIJ *)matin->data;
2994 
2995   PetscFunctionBegin;
2996   *newmat = NULL;
2997   PetscCall(MatCreate(PetscObjectComm((PetscObject)matin), &mat));
2998   PetscCall(MatSetSizes(mat, matin->rmap->n, matin->cmap->n, matin->rmap->N, matin->cmap->N));
2999   PetscCall(MatSetBlockSizesFromMats(mat, matin, matin));
3000   PetscCall(MatSetType(mat, ((PetscObject)matin)->type_name));
3001   a = (Mat_MPIAIJ *)mat->data;
3002 
3003   mat->factortype = matin->factortype;
3004   mat->assembled  = matin->assembled;
3005   mat->insertmode = NOT_SET_VALUES;
3006 
3007   a->size         = oldmat->size;
3008   a->rank         = oldmat->rank;
3009   a->donotstash   = oldmat->donotstash;
3010   a->roworiented  = oldmat->roworiented;
3011   a->rowindices   = NULL;
3012   a->rowvalues    = NULL;
3013   a->getrowactive = PETSC_FALSE;
3014 
3015   PetscCall(PetscLayoutReference(matin->rmap, &mat->rmap));
3016   PetscCall(PetscLayoutReference(matin->cmap, &mat->cmap));
3017   if (matin->hash_active) {
3018     PetscCall(MatSetUp(mat));
3019   } else {
3020     mat->preallocated = matin->preallocated;
3021     if (oldmat->colmap) {
3022 #if defined(PETSC_USE_CTABLE)
3023       PetscCall(PetscHMapIDuplicate(oldmat->colmap, &a->colmap));
3024 #else
3025       PetscCall(PetscMalloc1(mat->cmap->N, &a->colmap));
3026       PetscCall(PetscArraycpy(a->colmap, oldmat->colmap, mat->cmap->N));
3027 #endif
3028     } else a->colmap = NULL;
3029     if (oldmat->garray) {
3030       PetscInt len;
3031       len = oldmat->B->cmap->n;
3032       PetscCall(PetscMalloc1(len + 1, &a->garray));
3033       if (len) PetscCall(PetscArraycpy(a->garray, oldmat->garray, len));
3034     } else a->garray = NULL;
3035 
3036     /* MatDuplicate() may be called with a non-assembled matrix;
3037       in fact, MatDuplicate() only requires the matrix to be preallocated.
3038       This can happen inside a DMCreateMatrix_Shell */
3039     if (oldmat->lvec) PetscCall(VecDuplicate(oldmat->lvec, &a->lvec));
3040     if (oldmat->Mvctx) {
3041       a->Mvctx = oldmat->Mvctx;
3042       PetscCall(PetscObjectReference((PetscObject)oldmat->Mvctx));
3043     }
3044     PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));
3045     PetscCall(MatDuplicate(oldmat->B, cpvalues, &a->B));
3046   }
3047   PetscCall(PetscFunctionListDuplicate(((PetscObject)matin)->qlist, &((PetscObject)mat)->qlist));
3048   *newmat = mat;
3049   PetscFunctionReturn(PETSC_SUCCESS);
3050 }
3051 
3052 PetscErrorCode MatLoad_MPIAIJ(Mat newMat, PetscViewer viewer)
3053 {
3054   PetscBool isbinary, ishdf5;
3055 
3056   PetscFunctionBegin;
3057   PetscValidHeaderSpecific(newMat, MAT_CLASSID, 1);
3058   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
3059   /* force binary viewer to load .info file if it has not yet done so */
3060   PetscCall(PetscViewerSetUp(viewer));
3061   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3062   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERHDF5, &ishdf5));
3063   if (isbinary) {
3064     PetscCall(MatLoad_MPIAIJ_Binary(newMat, viewer));
3065   } else if (ishdf5) {
3066 #if defined(PETSC_HAVE_HDF5)
3067     PetscCall(MatLoad_AIJ_HDF5(newMat, viewer));
3068 #else
3069     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "HDF5 not supported in this build.\nPlease reconfigure using --download-hdf5");
3070 #endif
3071   } else {
3072     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)newMat)->type_name);
3073   }
3074   PetscFunctionReturn(PETSC_SUCCESS);
3075 }
3076 
3077 PetscErrorCode MatLoad_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
3078 {
3079   PetscInt     header[4], M, N, m, nz, rows, cols, sum, i;
3080   PetscInt    *rowidxs, *colidxs;
3081   PetscScalar *matvals;
3082 
3083   PetscFunctionBegin;
3084   PetscCall(PetscViewerSetUp(viewer));
3085 
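  /* Layout of the binary file read below: a four-entry header {MAT_FILE_CLASSID, M, N, nz},
     followed by the M row lengths, then the nz column indices, then the nz scalar values */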
3086   /* read in matrix header */
3087   PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
3088   PetscCheck(header[0] == MAT_FILE_CLASSID, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
3089   M  = header[1];
3090   N  = header[2];
3091   nz = header[3];
3092   PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
3093   PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
3094   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as MPIAIJ");
3095 
3096   /* set block sizes from the viewer's .info file */
3097   PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
3098   /* set global sizes if not set already */
3099   if (mat->rmap->N < 0) mat->rmap->N = M;
3100   if (mat->cmap->N < 0) mat->cmap->N = N;
3101   PetscCall(PetscLayoutSetUp(mat->rmap));
3102   PetscCall(PetscLayoutSetUp(mat->cmap));
3103 
3104   /* check if the matrix sizes are correct */
3105   PetscCall(MatGetSize(mat, &rows, &cols));
3106   PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);
3107 
3108   /* read in row lengths and build row indices */
3109   PetscCall(MatGetLocalSize(mat, &m, NULL));
3110   PetscCall(PetscMalloc1(m + 1, &rowidxs));
3111   PetscCall(PetscViewerBinaryReadAll(viewer, rowidxs + 1, m, PETSC_DECIDE, M, PETSC_INT));
3112   rowidxs[0] = 0;
3113   for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3114   if (nz != PETSC_INT_MAX) {
3115     PetscCallMPI(MPIU_Allreduce(&rowidxs[m], &sum, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)viewer)));
3116     PetscCheck(sum == nz, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);
3117   }
3118 
3119   /* read in column indices and matrix values */
3120   PetscCall(PetscMalloc2(rowidxs[m], &colidxs, rowidxs[m], &matvals));
3121   PetscCall(PetscViewerBinaryReadAll(viewer, colidxs, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
3122   PetscCall(PetscViewerBinaryReadAll(viewer, matvals, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
3123   /* store matrix indices and values */
3124   PetscCall(MatMPIAIJSetPreallocationCSR(mat, rowidxs, colidxs, matvals));
3125   PetscCall(PetscFree(rowidxs));
3126   PetscCall(PetscFree2(colidxs, matvals));
3127   PetscFunctionReturn(PETSC_SUCCESS);
3128 }
3129 
3130 /* Not scalable because of ISAllGather() unless getting all columns. */
3131 static PetscErrorCode ISGetSeqIS_Private(Mat mat, IS iscol, IS *isseq)
3132 {
3133   IS          iscol_local;
3134   PetscBool   isstride;
3135   PetscMPIInt lisstride = 0, gisstride;
3136 
3137   PetscFunctionBegin;
3138   /* check if we are grabbing all columns */
3139   PetscCall(PetscObjectTypeCompare((PetscObject)iscol, ISSTRIDE, &isstride));
3140 
3141   if (isstride) {
3142     PetscInt start, len, mstart, mlen;
3143     PetscCall(ISStrideGetInfo(iscol, &start, NULL));
3144     PetscCall(ISGetLocalSize(iscol, &len));
3145     PetscCall(MatGetOwnershipRangeColumn(mat, &mstart, &mlen));
3146     if (mstart == start && mlen - mstart == len) lisstride = 1;
3147   }
3148 
3149   PetscCallMPI(MPIU_Allreduce(&lisstride, &gisstride, 1, MPI_INT, MPI_MIN, PetscObjectComm((PetscObject)mat)));
3150   if (gisstride) {
3151     PetscInt N;
3152     PetscCall(MatGetSize(mat, NULL, &N));
3153     PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iscol_local));
3154     PetscCall(ISSetIdentity(iscol_local));
3155     PetscCall(PetscInfo(mat, "Optimizing for obtaining all columns of the matrix; skipping ISAllGather()\n"));
3156   } else {
3157     PetscInt cbs;
3158     PetscCall(ISGetBlockSize(iscol, &cbs));
3159     PetscCall(ISAllGather(iscol, &iscol_local));
3160     PetscCall(ISSetBlockSize(iscol_local, cbs));
3161   }
3162 
3163   *isseq = iscol_local;
3164   PetscFunctionReturn(PETSC_SUCCESS);
3165 }
3166 
3167 /*
3168  Used by MatCreateSubMatrix_MPIAIJ_SameRowColDist() to avoid ISAllGather() and an iscol_local of global size
3169  (see MatCreateSubMatrix_MPIAIJ_nonscalable)
3170 
3171  Input Parameters:
3172 +   mat - matrix
3173 .   isrow - parallel row index set; its local indices are a subset of the local rows of `mat`,
3174            i.e., mat->rstart <= isrow[i] < mat->rend
3175 -   iscol - parallel column index set; its local indices are a subset of local columns of `mat`,
3176            i.e., mat->cstart <= iscol[i] < mat->cend
3177 
3178  Output Parameters:
3179 +   isrow_d - sequential row index set for retrieving mat->A
3180 .   iscol_d - sequential column index set for retrieving mat->A
3181 .   iscol_o - sequential column index set for retrieving mat->B
3182 -   garray - column map; garray[i] indicates global location of iscol_o[i] in `iscol`
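 Example (hypothetical): if this rank owns global columns [4,8) and the global iscol selects
 columns {0, 5, 9}, then iscol_d selects local column 1 (global column 5), iscol_o selects the
 two off-process columns of mat->B, and garray[] = {0, 2} gives their positions within iscol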
3183  */
3184 static PetscErrorCode ISGetSeqIS_SameColDist_Private(Mat mat, IS isrow, IS iscol, IS *isrow_d, IS *iscol_d, IS *iscol_o, PetscInt *garray[])
3185 {
3186   Vec             x, cmap;
3187   const PetscInt *is_idx;
3188   PetscScalar    *xarray, *cmaparray;
3189   PetscInt        ncols, isstart, *idx, m, rstart, *cmap1, count;
3190   Mat_MPIAIJ     *a    = (Mat_MPIAIJ *)mat->data;
3191   Mat             B    = a->B;
3192   Vec             lvec = a->lvec, lcmap;
3193   PetscInt        i, cstart, cend, Bn = B->cmap->N;
3194   MPI_Comm        comm;
3195   VecScatter      Mvctx = a->Mvctx;
3196 
3197   PetscFunctionBegin;
3198   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3199   PetscCall(ISGetLocalSize(iscol, &ncols));
3200 
3201   /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */
3202   PetscCall(MatCreateVecs(mat, &x, NULL));
3203   PetscCall(VecSet(x, -1.0));
3204   PetscCall(VecDuplicate(x, &cmap));
3205   PetscCall(VecSet(cmap, -1.0));
3206 
3207   /* Get start indices */
3208   PetscCallMPI(MPI_Scan(&ncols, &isstart, 1, MPIU_INT, MPI_SUM, comm));
3209   isstart -= ncols;
3210   PetscCall(MatGetOwnershipRangeColumn(mat, &cstart, &cend));
3211 
3212   PetscCall(ISGetIndices(iscol, &is_idx));
3213   PetscCall(VecGetArray(x, &xarray));
3214   PetscCall(VecGetArray(cmap, &cmaparray));
3215   PetscCall(PetscMalloc1(ncols, &idx));
3216   for (i = 0; i < ncols; i++) {
3217     xarray[is_idx[i] - cstart]    = (PetscScalar)is_idx[i];
3218     cmaparray[is_idx[i] - cstart] = i + isstart;        /* global index of iscol[i] */
3219     idx[i]                        = is_idx[i] - cstart; /* local index of iscol[i]  */
3220   }
3221   PetscCall(VecRestoreArray(x, &xarray));
3222   PetscCall(VecRestoreArray(cmap, &cmaparray));
3223   PetscCall(ISRestoreIndices(iscol, &is_idx));
3224 
3225   /* Get iscol_d */
3226   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, iscol_d));
3227   PetscCall(ISGetBlockSize(iscol, &i));
3228   PetscCall(ISSetBlockSize(*iscol_d, i));
3229 
3230   /* Get isrow_d */
3231   PetscCall(ISGetLocalSize(isrow, &m));
3232   rstart = mat->rmap->rstart;
3233   PetscCall(PetscMalloc1(m, &idx));
3234   PetscCall(ISGetIndices(isrow, &is_idx));
3235   for (i = 0; i < m; i++) idx[i] = is_idx[i] - rstart;
3236   PetscCall(ISRestoreIndices(isrow, &is_idx));
3237 
3238   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, m, idx, PETSC_OWN_POINTER, isrow_d));
3239   PetscCall(ISGetBlockSize(isrow, &i));
3240   PetscCall(ISSetBlockSize(*isrow_d, i));
3241 
3242   /* (2) Scatter x and cmap using aij->Mvctx to get their off-process portions (see MatMult_MPIAIJ) */
3243   PetscCall(VecScatterBegin(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3244   PetscCall(VecScatterEnd(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3245 
3246   PetscCall(VecDuplicate(lvec, &lcmap));
3247 
3248   PetscCall(VecScatterBegin(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3249   PetscCall(VecScatterEnd(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3250 
3251   /* (3) create sequential iscol_o (a subset of iscol) and isgarray */
3252   /* off-process column indices */
3253   count = 0;
3254   PetscCall(PetscMalloc1(Bn, &idx));
3255   PetscCall(PetscMalloc1(Bn, &cmap1));
3256 
3257   PetscCall(VecGetArray(lvec, &xarray));
3258   PetscCall(VecGetArray(lcmap, &cmaparray));
3259   for (i = 0; i < Bn; i++) {
3260     if (PetscRealPart(xarray[i]) > -1.0) {
3261       idx[count]   = i;                                     /* local column index in off-diagonal part B */
3262       cmap1[count] = (PetscInt)PetscRealPart(cmaparray[i]); /* column index in submat */
3263       count++;
3264     }
3265   }
3266   PetscCall(VecRestoreArray(lvec, &xarray));
3267   PetscCall(VecRestoreArray(lcmap, &cmaparray));
3268 
3269   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_COPY_VALUES, iscol_o));
3270   /* cannot ensure iscol_o has same blocksize as iscol! */
3271 
3272   PetscCall(PetscFree(idx));
3273   *garray = cmap1;
3274 
3275   PetscCall(VecDestroy(&x));
3276   PetscCall(VecDestroy(&cmap));
3277   PetscCall(VecDestroy(&lcmap));
3278   PetscFunctionReturn(PETSC_SUCCESS);
3279 }
3280 
3281 /* isrow and iscol have the same processor distribution as mat; the output *submat is a submatrix of the local mat */
3282 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowColDist(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *submat)
3283 {
3284   Mat_MPIAIJ *a = (Mat_MPIAIJ *)mat->data, *asub;
3285   Mat         M = NULL;
3286   MPI_Comm    comm;
3287   IS          iscol_d, isrow_d, iscol_o;
3288   Mat         Asub = NULL, Bsub = NULL;
3289   PetscInt    n;
3290 
3291   PetscFunctionBegin;
3292   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3293 
3294   if (call == MAT_REUSE_MATRIX) {
3295     /* Retrieve isrow_d, iscol_d and iscol_o from submat */
3296     PetscCall(PetscObjectQuery((PetscObject)*submat, "isrow_d", (PetscObject *)&isrow_d));
3297     PetscCheck(isrow_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "isrow_d passed in was not used before, cannot reuse");
3298 
3299     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_d", (PetscObject *)&iscol_d));
3300     PetscCheck(iscol_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_d passed in was not used before, cannot reuse");
3301 
3302     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_o", (PetscObject *)&iscol_o));
3303     PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse");
3304 
3305     /* Update diagonal and off-diagonal portions of submat */
3306     asub = (Mat_MPIAIJ *)(*submat)->data;
3307     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->A));
3308     PetscCall(ISGetLocalSize(iscol_o, &n));
3309     if (n) PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->B));
3310     PetscCall(MatAssemblyBegin(*submat, MAT_FINAL_ASSEMBLY));
3311     PetscCall(MatAssemblyEnd(*submat, MAT_FINAL_ASSEMBLY));
3312 
3313   } else { /* call == MAT_INITIAL_MATRIX */
3314     PetscInt *garray;
3315     PetscInt  BsubN;
3316 
3317     /* Create isrow_d, iscol_d, iscol_o and isgarray (replace isgarray with array?) */
3318     PetscCall(ISGetSeqIS_SameColDist_Private(mat, isrow, iscol, &isrow_d, &iscol_d, &iscol_o, &garray));
3319 
3320     /* Create local submatrices Asub and Bsub */
3321     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Asub));
3322     PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Bsub));
3323 
3324     /* Create submatrix M */
3325     PetscCall(MatCreateMPIAIJWithSeqAIJ(comm, Asub, Bsub, garray, &M));
3326 
3327     /* If Bsub has empty columns, compress iscol_o such that it will retrieve condensed Bsub from a->B during reuse */
3328     asub = (Mat_MPIAIJ *)M->data;
3329 
3330     PetscCall(ISGetLocalSize(iscol_o, &BsubN));
3331     n = asub->B->cmap->N;
3332     if (BsubN > n) {
3333       /* This case can be tested using ~petsc/src/tao/bound/tutorials/runplate2_3 */
3334       const PetscInt *idx;
3335       PetscInt        i, j, *idx_new, *subgarray = asub->garray;
3336       PetscCall(PetscInfo(M, "submatrix Bn %" PetscInt_FMT " != BsubN %" PetscInt_FMT ", update iscol_o\n", n, BsubN));
3337 
3338       PetscCall(PetscMalloc1(n, &idx_new));
3339       j = 0;
3340       PetscCall(ISGetIndices(iscol_o, &idx));
3341       for (i = 0; i < n; i++) {
3342         if (j >= BsubN) break;
3343         while (subgarray[i] > garray[j]) j++;
3344 
3345         if (subgarray[i] == garray[j]) {
3346           idx_new[i] = idx[j++];
3347         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "subgarray[%" PetscInt_FMT "]=%" PetscInt_FMT " cannot be less than garray[%" PetscInt_FMT "]=%" PetscInt_FMT, i, subgarray[i], j, garray[j]);
3348       }
3349       PetscCall(ISRestoreIndices(iscol_o, &idx));
3350 
3351       PetscCall(ISDestroy(&iscol_o));
3352       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, idx_new, PETSC_OWN_POINTER, &iscol_o));
3353 
3354     } else if (BsubN < n) {
3355       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Columns of Bsub (%" PetscInt_FMT ") cannot be smaller than B's (%" PetscInt_FMT ")", BsubN, asub->B->cmap->N);
3356     }
3357 
3358     PetscCall(PetscFree(garray));
3359     *submat = M;
3360 
3361     /* Save isrow_d, iscol_d and iscol_o used in processor for next request */
3362     PetscCall(PetscObjectCompose((PetscObject)M, "isrow_d", (PetscObject)isrow_d));
3363     PetscCall(ISDestroy(&isrow_d));
3364 
3365     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_d", (PetscObject)iscol_d));
3366     PetscCall(ISDestroy(&iscol_d));
3367 
3368     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_o", (PetscObject)iscol_o));
3369     PetscCall(ISDestroy(&iscol_o));
3370   }
3371   PetscFunctionReturn(PETSC_SUCCESS);
3372 }
3373 
3374 PetscErrorCode MatCreateSubMatrix_MPIAIJ(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *newmat)
3375 {
3376   IS        iscol_local = NULL, isrow_d;
3377   PetscInt  csize;
3378   PetscInt  n, i, j, start, end;
3379   PetscBool sameRowDist = PETSC_FALSE, sameDist[2], tsameDist[2];
3380   MPI_Comm  comm;
3381 
3382   PetscFunctionBegin;
3383   /* If isrow has same processor distribution as mat,
3384      call MatCreateSubMatrix_MPIAIJ_SameRowDist() to avoid using a hash table with global size of iscol */
3385   if (call == MAT_REUSE_MATRIX) {
3386     PetscCall(PetscObjectQuery((PetscObject)*newmat, "isrow_d", (PetscObject *)&isrow_d));
3387     if (isrow_d) {
3388       sameRowDist  = PETSC_TRUE;
3389       tsameDist[1] = PETSC_TRUE; /* sameColDist */
3390     } else {
3391       PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_local));
3392       if (iscol_local) {
3393         sameRowDist  = PETSC_TRUE;
3394         tsameDist[1] = PETSC_FALSE; /* !sameColDist */
3395       }
3396     }
3397   } else {
3398     /* Check if isrow has same processor distribution as mat */
3399     sameDist[0] = PETSC_FALSE;
3400     PetscCall(ISGetLocalSize(isrow, &n));
3401     if (!n) {
3402       sameDist[0] = PETSC_TRUE;
3403     } else {
3404       PetscCall(ISGetMinMax(isrow, &i, &j));
3405       PetscCall(MatGetOwnershipRange(mat, &start, &end));
3406       if (i >= start && j < end) sameDist[0] = PETSC_TRUE;
3407     }
3408 
3409     /* Check if iscol has same processor distribution as mat */
3410     sameDist[1] = PETSC_FALSE;
3411     PetscCall(ISGetLocalSize(iscol, &n));
3412     if (!n) {
3413       sameDist[1] = PETSC_TRUE;
3414     } else {
3415       PetscCall(ISGetMinMax(iscol, &i, &j));
3416       PetscCall(MatGetOwnershipRangeColumn(mat, &start, &end));
3417       if (i >= start && j < end) sameDist[1] = PETSC_TRUE;
3418     }
3419 
3420     PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3421     PetscCallMPI(MPIU_Allreduce(&sameDist, &tsameDist, 2, MPIU_BOOL, MPI_LAND, comm));
3422     sameRowDist = tsameDist[0];
3423   }
3424 
3425   if (sameRowDist) {
3426     if (tsameDist[1]) { /* sameRowDist & sameColDist */
3427       /* isrow and iscol have same processor distribution as mat */
3428       PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowColDist(mat, isrow, iscol, call, newmat));
3429       PetscFunctionReturn(PETSC_SUCCESS);
3430     } else { /* sameRowDist */
3431       /* isrow has same processor distribution as mat */
3432       if (call == MAT_INITIAL_MATRIX) {
3433         PetscBool sorted;
3434         PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3435         PetscCall(ISGetLocalSize(iscol_local, &n)); /* local size of iscol_local = global columns of newmat */
3436         PetscCall(ISGetSize(iscol, &i));
3437         PetscCheck(n == i, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "n %" PetscInt_FMT " != size of iscol %" PetscInt_FMT, n, i);
3438 
3439         PetscCall(ISSorted(iscol_local, &sorted));
3440         if (sorted) {
3441           /* MatCreateSubMatrix_MPIAIJ_SameRowDist() requires iscol_local to be sorted; it may have duplicate indices */
3442           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, iscol_local, MAT_INITIAL_MATRIX, newmat));
3443           PetscFunctionReturn(PETSC_SUCCESS);
3444         }
3445       } else { /* call == MAT_REUSE_MATRIX */
3446         IS iscol_sub;
3447         PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3448         if (iscol_sub) {
3449           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, NULL, call, newmat));
3450           PetscFunctionReturn(PETSC_SUCCESS);
3451         }
3452       }
3453     }
3454   }
3455 
3456   /* General case: iscol -> iscol_local, which has the global size of iscol */
3457   if (call == MAT_REUSE_MATRIX) {
3458     PetscCall(PetscObjectQuery((PetscObject)*newmat, "ISAllGather", (PetscObject *)&iscol_local));
3459     PetscCheck(iscol_local, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3460   } else {
3461     if (!iscol_local) PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3462   }
3463 
3464   PetscCall(ISGetLocalSize(iscol, &csize));
3465   PetscCall(MatCreateSubMatrix_MPIAIJ_nonscalable(mat, isrow, iscol_local, csize, call, newmat));
3466 
3467   if (call == MAT_INITIAL_MATRIX) {
3468     PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3469     PetscCall(ISDestroy(&iscol_local));
3470   }
3471   PetscFunctionReturn(PETSC_SUCCESS);
3472 }
3473 
3474 /*@C
3475   MatCreateMPIAIJWithSeqAIJ - creates a `MATMPIAIJ` matrix using `MATSEQAIJ` matrices that contain the "diagonal"
3476   and "off-diagonal" parts of the matrix in CSR format.
3477 
3478   Collective
3479 
3480   Input Parameters:
3481 + comm   - MPI communicator
3482 . A      - "diagonal" portion of matrix
3483 . B      - "off-diagonal" portion of matrix, may have empty columns, will be destroyed by this routine
3484 - garray - global index of `B` columns
3485 
3486   Output Parameter:
3487 . mat - the matrix, with input `A` as its local diagonal matrix
3488 
3489   Level: advanced
3490 
3491   Notes:
3492   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix.
3493 
3494   `A` becomes part of the output mat and `B` is destroyed by this routine; the user must not use `A` or `B` afterwards.
3495 
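  Example Usage:
  A minimal sketch (assuming each rank holds a sequential diagonal block `Ad`, an off-diagonal block `Ao`, and an array `gcols[]` giving the global column of each column of `Ao`):
.vb
  Mat C;

  PetscCall(MatCreateMPIAIJWithSeqAIJ(PETSC_COMM_WORLD, Ad, Ao, gcols, &C));
  /* Ad is now owned by C and Ao has been destroyed; only C may be used from here on */
.ve
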
3496 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateMPIAIJWithSplitArrays()`
3497 @*/
3498 PetscErrorCode MatCreateMPIAIJWithSeqAIJ(MPI_Comm comm, Mat A, Mat B, const PetscInt garray[], Mat *mat)
3499 {
3500   Mat_MPIAIJ        *maij;
3501   Mat_SeqAIJ        *b  = (Mat_SeqAIJ *)B->data, *bnew;
3502   PetscInt          *oi = b->i, *oj = b->j, i, nz, col;
3503   const PetscScalar *oa;
3504   Mat                Bnew;
3505   PetscInt           m, n, N;
3506   MatType            mpi_mat_type;
3507 
3508   PetscFunctionBegin;
3509   PetscCall(MatCreate(comm, mat));
3510   PetscCall(MatGetSize(A, &m, &n));
3511   PetscCheck(m == B->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Am %" PetscInt_FMT " != Bm %" PetscInt_FMT, m, B->rmap->N);
3512   PetscCheck(PetscAbs(A->rmap->bs) == PetscAbs(B->rmap->bs), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "A row bs %" PetscInt_FMT " != B row bs %" PetscInt_FMT, A->rmap->bs, B->rmap->bs);
3513   /* the check below was removed: when B is created using iscol_o from ISGetSeqIS_SameColDist_Private(), its bs may not be the same as A's */
3514   /* PetscCheck(A->cmap->bs == B->cmap->bs,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"A column bs %" PetscInt_FMT " != B column bs %" PetscInt_FMT,A->cmap->bs,B->cmap->bs); */
3515 
3516   /* Get global columns of mat */
3517   PetscCallMPI(MPIU_Allreduce(&n, &N, 1, MPIU_INT, MPI_SUM, comm));
3518 
3519   PetscCall(MatSetSizes(*mat, m, n, PETSC_DECIDE, N));
3520   /* Determine the type of MPI matrix that should be created from the type of matrix A, which holds the "diagonal" portion. */
3521   PetscCall(MatGetMPIMatType_Private(A, &mpi_mat_type));
3522   PetscCall(MatSetType(*mat, mpi_mat_type));
3523 
3524   if (A->rmap->bs > 1 || A->cmap->bs > 1) PetscCall(MatSetBlockSizes(*mat, A->rmap->bs, A->cmap->bs));
3525   maij = (Mat_MPIAIJ *)(*mat)->data;
3526 
3527   (*mat)->preallocated = PETSC_TRUE;
3528 
3529   PetscCall(PetscLayoutSetUp((*mat)->rmap));
3530   PetscCall(PetscLayoutSetUp((*mat)->cmap));
3531 
3532   /* Set A as diagonal portion of *mat */
3533   maij->A = A;
3534 
3535   nz = oi[m];
3536   for (i = 0; i < nz; i++) {
3537     col   = oj[i];
3538     oj[i] = garray[col];
3539   }
3540 
3541   /* Set Bnew as off-diagonal portion of *mat */
3542   PetscCall(MatSeqAIJGetArrayRead(B, &oa));
3543   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, N, oi, oj, (PetscScalar *)oa, &Bnew));
3544   PetscCall(MatSeqAIJRestoreArrayRead(B, &oa));
3545   bnew        = (Mat_SeqAIJ *)Bnew->data;
3546   bnew->maxnz = b->maxnz; /* allocated nonzeros of B */
3547   maij->B     = Bnew;
3548 
3549   PetscCheck(B->rmap->N == Bnew->rmap->N, PETSC_COMM_SELF, PETSC_ERR_PLIB, "BN %" PetscInt_FMT " != BnewN %" PetscInt_FMT, B->rmap->N, Bnew->rmap->N);
3550 
3551   b->free_a  = PETSC_FALSE;
3552   b->free_ij = PETSC_FALSE;
3553   PetscCall(MatDestroy(&B));
3554 
3555   bnew->free_a  = PETSC_TRUE;
3556   bnew->free_ij = PETSC_TRUE;
3557 
3558   /* condense columns of maij->B */
3559   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
3560   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
3561   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
3562   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
3563   PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3564   PetscFunctionReturn(PETSC_SUCCESS);
3565 }
3566 
3567 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, PetscBool, Mat *);
3568 
3569 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowDist(Mat mat, IS isrow, IS iscol, IS iscol_local, MatReuse call, Mat *newmat)
3570 {
3571   PetscInt        i, m, n, rstart, row, rend, nz, j, bs, cbs;
3572   PetscInt       *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3573   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)mat->data;
3574   Mat             M, Msub, B = a->B;
3575   MatScalar      *aa;
3576   Mat_SeqAIJ     *aij;
3577   PetscInt       *garray = a->garray, *colsub, Ncols;
3578   PetscInt        count, Bn = B->cmap->N, cstart = mat->cmap->rstart, cend = mat->cmap->rend;
3579   IS              iscol_sub, iscmap;
3580   const PetscInt *is_idx, *cmap;
3581   PetscBool       allcolumns = PETSC_FALSE;
3582   MPI_Comm        comm;
3583 
3584   PetscFunctionBegin;
3585   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3586   if (call == MAT_REUSE_MATRIX) {
3587     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3588     PetscCheck(iscol_sub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "SubIScol passed in was not used before, cannot reuse");
3589     PetscCall(ISGetLocalSize(iscol_sub, &count));
3590 
3591     PetscCall(PetscObjectQuery((PetscObject)*newmat, "Subcmap", (PetscObject *)&iscmap));
3592     PetscCheck(iscmap, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Subcmap passed in was not used before, cannot reuse");
3593 
3594     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Msub));
3595     PetscCheck(Msub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3596 
3597     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_REUSE_MATRIX, PETSC_FALSE, &Msub));
3598 
3599   } else { /* call == MAT_INITIAL_MATRIX */
3600     PetscBool flg;
3601 
3602     PetscCall(ISGetLocalSize(iscol, &n));
3603     PetscCall(ISGetSize(iscol, &Ncols));
3604 
3605     /* (1) iscol -> nonscalable iscol_local */
3606     /* Check for special case: each processor gets entire matrix columns */
3607     PetscCall(ISIdentity(iscol_local, &flg));
3608     if (flg && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3609     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3610     if (allcolumns) {
3611       iscol_sub = iscol_local;
3612       PetscCall(PetscObjectReference((PetscObject)iscol_local));
3613       PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iscmap));
3614 
3615     } else {
3616       /* (2) iscol_local -> iscol_sub and iscmap. The implementation below requires iscol_local to be sorted; it may have duplicate indices */
3617       PetscInt *idx, *cmap1, k;
3618       PetscCall(PetscMalloc1(Ncols, &idx));
3619       PetscCall(PetscMalloc1(Ncols, &cmap1));
3620       PetscCall(ISGetIndices(iscol_local, &is_idx));
3621       count = 0;
3622       k     = 0;
3623       for (i = 0; i < Ncols; i++) {
3624         j = is_idx[i];
3625         if (j >= cstart && j < cend) {
3626           /* diagonal part of mat */
3627           idx[count]     = j;
3628           cmap1[count++] = i; /* column index in submat */
3629         } else if (Bn) {
3630           /* off-diagonal part of mat */
3631           if (j == garray[k]) {
3632             idx[count]     = j;
3633             cmap1[count++] = i; /* column index in submat */
3634           } else if (j > garray[k]) {
3635             while (j > garray[k] && k < Bn - 1) k++;
3636             if (j == garray[k]) {
3637               idx[count]     = j;
3638               cmap1[count++] = i; /* column index in submat */
3639             }
3640           }
3641         }
3642       }
3643       PetscCall(ISRestoreIndices(iscol_local, &is_idx));
3644 
3645       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_OWN_POINTER, &iscol_sub));
3646       PetscCall(ISGetBlockSize(iscol, &cbs));
3647       PetscCall(ISSetBlockSize(iscol_sub, cbs));
3648 
3649       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)iscol_local), count, cmap1, PETSC_OWN_POINTER, &iscmap));
3650     }
3651 
3652     /* (3) Create sequential Msub */
3653     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_INITIAL_MATRIX, allcolumns, &Msub));
3654   }
3655 
3656   PetscCall(ISGetLocalSize(iscol_sub, &count));
3657   aij = (Mat_SeqAIJ *)Msub->data;
3658   ii  = aij->i;
3659   PetscCall(ISGetIndices(iscmap, &cmap));
3660 
3661   /*
3662       m - number of local rows
3663       Ncols - number of columns (same on all processors)
3664       rstart - first row in new global matrix generated
3665   */
3666   PetscCall(MatGetSize(Msub, &m, NULL));
3667 
3668   if (call == MAT_INITIAL_MATRIX) {
3669     /* (4) Create parallel newmat */
3670     PetscMPIInt rank, size;
3671     PetscInt    csize;
3672 
3673     PetscCallMPI(MPI_Comm_size(comm, &size));
3674     PetscCallMPI(MPI_Comm_rank(comm, &rank));
3675 
3676     /*
3677         Determine the number of non-zeros in the diagonal and off-diagonal
3678         portions of the matrix in order to do correct preallocation
3679     */
3680 
3681     /* first get start and end of "diagonal" columns */
3682     PetscCall(ISGetLocalSize(iscol, &csize));
3683     if (csize == PETSC_DECIDE) {
3684       PetscCall(ISGetSize(isrow, &mglobal));
3685       if (mglobal == Ncols) { /* square matrix */
3686         nlocal = m;
3687       } else {
3688         nlocal = Ncols / size + ((Ncols % size) > rank);
3689       }
3690     } else {
3691       nlocal = csize;
3692     }
3693     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3694     rstart = rend - nlocal;
3695     PetscCheck(rank != size - 1 || rend == Ncols, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, Ncols);
3696 
3697     /* next, compute all the lengths */
3698     jj = aij->j;
3699     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3700     olens = dlens + m;
3701     for (i = 0; i < m; i++) {
3702       jend = ii[i + 1] - ii[i];
3703       olen = 0;
3704       dlen = 0;
3705       for (j = 0; j < jend; j++) {
3706         if (cmap[*jj] < rstart || cmap[*jj] >= rend) olen++;
3707         else dlen++;
3708         jj++;
3709       }
3710       olens[i] = olen;
3711       dlens[i] = dlen;
3712     }
3713 
3714     PetscCall(ISGetBlockSize(isrow, &bs));
3715     PetscCall(ISGetBlockSize(iscol, &cbs));
3716 
3717     PetscCall(MatCreate(comm, &M));
3718     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, Ncols));
3719     PetscCall(MatSetBlockSizes(M, bs, cbs));
3720     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3721     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3722     PetscCall(PetscFree(dlens));
3723 
3724   } else { /* call == MAT_REUSE_MATRIX */
3725     M = *newmat;
3726     PetscCall(MatGetLocalSize(M, &i, NULL));
3727     PetscCheck(i == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3728     PetscCall(MatZeroEntries(M));
3729     /*
3730          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3731        rather than the slower MatSetValues().
3732     */
3733     M->was_assembled = PETSC_TRUE;
3734     M->assembled     = PETSC_FALSE;
3735   }
3736 
3737   /* (5) Set values of Msub to *newmat */
3738   PetscCall(PetscMalloc1(count, &colsub));
3739   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
3740 
3741   jj = aij->j;
3742   PetscCall(MatSeqAIJGetArrayRead(Msub, (const PetscScalar **)&aa));
3743   for (i = 0; i < m; i++) {
3744     row = rstart + i;
3745     nz  = ii[i + 1] - ii[i];
3746     for (j = 0; j < nz; j++) colsub[j] = cmap[jj[j]];
3747     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, colsub, aa, INSERT_VALUES));
3748     jj += nz;
3749     aa += nz;
3750   }
3751   PetscCall(MatSeqAIJRestoreArrayRead(Msub, (const PetscScalar **)&aa));
3752   PetscCall(ISRestoreIndices(iscmap, &cmap));
3753 
3754   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3755   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3756 
3757   PetscCall(PetscFree(colsub));
3758 
3759   /* save Msub, iscol_sub and iscmap used in processor for next request */
3760   if (call == MAT_INITIAL_MATRIX) {
3761     *newmat = M;
3762     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubMatrix", (PetscObject)Msub));
3763     PetscCall(MatDestroy(&Msub));
3764 
3765     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubIScol", (PetscObject)iscol_sub));
3766     PetscCall(ISDestroy(&iscol_sub));
3767 
3768     PetscCall(PetscObjectCompose((PetscObject)*newmat, "Subcmap", (PetscObject)iscmap));
3769     PetscCall(ISDestroy(&iscmap));
3770 
3771     if (iscol_local) {
3772       PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3773       PetscCall(ISDestroy(&iscol_local));
3774     }
3775   }
3776   PetscFunctionReturn(PETSC_SUCCESS);
3777 }
3778 
3779 /*
3780     Not great since it makes two copies of the submatrix: first a local SeqAIJ copy, then the
3781   end result obtained by concatenating the local matrices. Writing it directly would be much
3782   like MatCreateSubMatrices_MPIAIJ().
3783 
3784   This requires a sequential iscol with all indices.
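  (e.g., the iscol_local that ISGetSeqIS_Private() builds via ISAllGather(), as done in MatCreateSubMatrix_MPIAIJ())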
3785 */
3786 PetscErrorCode MatCreateSubMatrix_MPIAIJ_nonscalable(Mat mat, IS isrow, IS iscol, PetscInt csize, MatReuse call, Mat *newmat)
3787 {
3788   PetscMPIInt rank, size;
3789   PetscInt    i, m, n, rstart, row, rend, nz, *cwork, j, bs, cbs;
3790   PetscInt   *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3791   Mat         M, Mreuse;
3792   MatScalar  *aa, *vwork;
3793   MPI_Comm    comm;
3794   Mat_SeqAIJ *aij;
3795   PetscBool   colflag, allcolumns = PETSC_FALSE;
3796 
3797   PetscFunctionBegin;
3798   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3799   PetscCallMPI(MPI_Comm_rank(comm, &rank));
3800   PetscCallMPI(MPI_Comm_size(comm, &size));
3801 
3802   /* Check for special case: each processor gets entire matrix columns */
3803   PetscCall(ISIdentity(iscol, &colflag));
3804   PetscCall(ISGetLocalSize(iscol, &n));
3805   if (colflag && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3806   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3807 
3808   if (call == MAT_REUSE_MATRIX) {
3809     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Mreuse));
3810     PetscCheck(Mreuse, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3811     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_REUSE_MATRIX, allcolumns, &Mreuse));
3812   } else {
3813     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, allcolumns, &Mreuse));
3814   }
3815 
3816   /*
3817       m - number of local rows
3818       n - number of columns (same on all processors)
3819       rstart - first row in new global matrix generated
3820   */
3821   PetscCall(MatGetSize(Mreuse, &m, &n));
3822   PetscCall(MatGetBlockSizes(Mreuse, &bs, &cbs));
3823   if (call == MAT_INITIAL_MATRIX) {
3824     aij = (Mat_SeqAIJ *)Mreuse->data;
3825     ii  = aij->i;
3826     jj  = aij->j;
3827 
3828     /*
3829         Determine the number of non-zeros in the diagonal and off-diagonal
3830         portions of the matrix in order to do correct preallocation
3831     */
3832 
3833     /* first get start and end of "diagonal" columns */
3834     if (csize == PETSC_DECIDE) {
3835       PetscCall(ISGetSize(isrow, &mglobal));
3836       if (mglobal == n) { /* square matrix */
3837         nlocal = m;
3838       } else {
3839         nlocal = n / size + ((n % size) > rank);
        nlocal = n / size + ((n % size) > rank); /* spread the remainder of n over the first n%size ranks */
3841     } else {
3842       nlocal = csize;
3843     }
3844     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3845     rstart = rend - nlocal;
3846     PetscCheck(rank != size - 1 || rend == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, n);
3847 
3848     /* next, compute all the lengths */
3849     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3850     olens = dlens + m;
3851     for (i = 0; i < m; i++) {
3852       jend = ii[i + 1] - ii[i];
3853       olen = 0;
3854       dlen = 0;
3855       for (j = 0; j < jend; j++) {
3856         if (*jj < rstart || *jj >= rend) olen++;
3857         else dlen++;
3858         jj++;
3859       }
3860       olens[i] = olen;
3861       dlens[i] = dlen;
3862     }
3863     PetscCall(MatCreate(comm, &M));
3864     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, n));
3865     PetscCall(MatSetBlockSizes(M, bs, cbs));
3866     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3867     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3868     PetscCall(PetscFree(dlens));
3869   } else {
3870     PetscInt ml, nl;
3871 
3872     M = *newmat;
3873     PetscCall(MatGetLocalSize(M, &ml, &nl));
3874     PetscCheck(ml == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3875     PetscCall(MatZeroEntries(M));
3876     /*
3877          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3878        rather than the slower MatSetValues().
3879     */
3880     M->was_assembled = PETSC_TRUE;
3881     M->assembled     = PETSC_FALSE;
3882   }
3883   PetscCall(MatGetOwnershipRange(M, &rstart, &rend));
3884   aij = (Mat_SeqAIJ *)Mreuse->data;
3885   ii  = aij->i;
3886   jj  = aij->j;
3887 
3888   /* trigger copy to CPU if needed */
3889   PetscCall(MatSeqAIJGetArrayRead(Mreuse, (const PetscScalar **)&aa));
3890   for (i = 0; i < m; i++) {
3891     row   = rstart + i;
3892     nz    = ii[i + 1] - ii[i];
3893     cwork = jj;
3894     jj    = PetscSafePointerPlusOffset(jj, nz);
3895     vwork = aa;
3896     aa    = PetscSafePointerPlusOffset(aa, nz);
3897     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, cwork, vwork, INSERT_VALUES));
3898   }
3899   PetscCall(MatSeqAIJRestoreArrayRead(Mreuse, (const PetscScalar **)&aa));
3900 
3901   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3902   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3903   *newmat = M;
3904 
3905   /* save submatrix used in processor for next request */
3906   if (call == MAT_INITIAL_MATRIX) {
3907     PetscCall(PetscObjectCompose((PetscObject)M, "SubMatrix", (PetscObject)Mreuse));
3908     PetscCall(MatDestroy(&Mreuse));
3909   }
3910   PetscFunctionReturn(PETSC_SUCCESS);
3911 }
3912 
3913 static PetscErrorCode MatMPIAIJSetPreallocationCSR_MPIAIJ(Mat B, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
3914 {
3915   PetscInt        m, cstart, cend, j, nnz, i, d, *ld;
3916   PetscInt       *d_nnz, *o_nnz, nnz_max = 0, rstart, ii, irstart;
3917   const PetscInt *JJ;
3918   PetscBool       nooffprocentries;
3919   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)B->data;
3920 
3921   PetscFunctionBegin;
3922   PetscCall(PetscLayoutSetUp(B->rmap));
3923   PetscCall(PetscLayoutSetUp(B->cmap));
3924   m       = B->rmap->n;
3925   cstart  = B->cmap->rstart;
3926   cend    = B->cmap->rend;
3927   rstart  = B->rmap->rstart;
3928   irstart = Ii[0];
3929 
3930   PetscCall(PetscCalloc2(m, &d_nnz, m, &o_nnz));
3931 
3932   if (PetscDefined(USE_DEBUG)) {
3933     for (i = 0; i < m; i++) {
3934       nnz = Ii[i + 1] - Ii[i];
3935       JJ  = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3936       PetscCheck(nnz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative %" PetscInt_FMT " number of columns", i, nnz);
3937       PetscCheck(!nnz || !(JJ[0] < 0), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " starts with negative column index %" PetscInt_FMT, i, JJ[0]);
3938       PetscCheck(!nnz || !(JJ[nnz - 1] >= B->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " ends with too large a column index %" PetscInt_FMT " (max allowed %" PetscInt_FMT ")", i, JJ[nnz - 1], B->cmap->N);
3939     }
3940   }
3941 
3942   for (i = 0; i < m; i++) {
3943     nnz     = Ii[i + 1] - Ii[i];
3944     JJ      = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3945     nnz_max = PetscMax(nnz_max, nnz);
3946     d       = 0;
3947     for (j = 0; j < nnz; j++) {
3948       if (cstart <= JJ[j] && JJ[j] < cend) d++;
3949     }
3950     d_nnz[i] = d;
3951     o_nnz[i] = nnz - d;
3952   }
3953   PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
3954   PetscCall(PetscFree2(d_nnz, o_nnz));
3955 
3956   for (i = 0; i < m; i++) {
3957     ii = i + rstart;
3958     PetscCall(MatSetValues_MPIAIJ(B, 1, &ii, Ii[i + 1] - Ii[i], PetscSafePointerPlusOffset(J, Ii[i] - irstart), PetscSafePointerPlusOffset(v, Ii[i] - irstart), INSERT_VALUES));
3959   }
3960   nooffprocentries    = B->nooffprocentries;
3961   B->nooffprocentries = PETSC_TRUE;
3962   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
3963   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
3964   B->nooffprocentries = nooffprocentries;
3965 
3966   /* count number of entries below block diagonal */
3967   PetscCall(PetscFree(Aij->ld));
3968   PetscCall(PetscCalloc1(m, &ld));
3969   Aij->ld = ld;
3970   for (i = 0; i < m; i++) {
3971     nnz = Ii[i + 1] - Ii[i];
3972     j   = 0;
3973     while (j < nnz && J[j] < cstart) j++;
3974     ld[i] = j;
3975     if (J) J += nnz;
3976   }
3977 
3978   PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3979   PetscFunctionReturn(PETSC_SUCCESS);
3980 }
3981 
3982 /*@
3983   MatMPIAIJSetPreallocationCSR - Allocates memory for a sparse parallel matrix in `MATAIJ` format
3984   (the default parallel PETSc format).
3985 
3986   Collective
3987 
3988   Input Parameters:
3989 + B - the matrix
3990 . i - the indices into `j` for the start of each local row (indices start with zero)
3991 . j - the column indices for each local row (indices start with zero)
3992 - v - optional values in the matrix
3993 
3994   Level: developer
3995 
3996   Notes:
3997   The `i`, `j`, and `v` arrays ARE copied by this routine into the internal format used by PETSc;
3998   thus you CANNOT change the matrix entries by changing the values of `v` after you have
3999   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
4000 
4001   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
4002 
4003   A convenience routine for this functionality is `MatCreateMPIAIJWithArrays()`.
4004 
4005   You can update the matrix with new numerical values using `MatUpdateMPIAIJWithArrays()` after this call if the column indices in `j` are sorted.
4006 
4007   If you do **not** use `MatUpdateMPIAIJWithArrays()`, the column indices in `j` do not need to be sorted. If you will use
4008   `MatUpdateMPIAIJWithArrays()`, the column indices **must** be sorted.
4009 
  The format used for the sparse matrix input is equivalent to a
  row-major ordering, i.e., for the following matrix the expected input data is
  as shown
4013 .vb
4014         1 0 0
4015         2 0 3     P0
4016        -------
4017         4 5 6     P1
4018 
4019      Process0 [P0] rows_owned=[0,1]
4020         i =  {0,1,3}  [size = nrow+1  = 2+1]
4021         j =  {0,0,2}  [size = 3]
4022         v =  {1,2,3}  [size = 3]
4023 
4024      Process1 [P1] rows_owned=[2]
4025         i =  {0,3}    [size = nrow+1  = 1+1]
4026         j =  {0,1,2}  [size = 3]
4027         v =  {4,5,6}  [size = 3]
4028 .ve
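
  For instance, on process 0 of the example above the call would look roughly as
  follows (a minimal sketch; it assumes `B` has already been created as a `MATMPIAIJ`
  matrix with 2 local rows and 3 global columns, and omits error checking):
.vb
  PetscInt    i[] = {0, 1, 3};       /* row offsets of the 2 local rows */
  PetscInt    j[] = {0, 0, 2};       /* global column indices           */
  PetscScalar v[] = {1.0, 2.0, 3.0}; /* values, stored row by row       */

  MatMPIAIJSetPreallocationCSR(B, i, j, v);
.ve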
4029 
4030 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatCreateAIJ()`,
4031           `MatCreateSeqAIJWithArrays()`, `MatCreateMPIAIJWithSplitArrays()`, `MatCreateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4032 @*/
4033 PetscErrorCode MatMPIAIJSetPreallocationCSR(Mat B, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
4034 {
4035   PetscFunctionBegin;
4036   PetscTryMethod(B, "MatMPIAIJSetPreallocationCSR_C", (Mat, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, i, j, v));
4037   PetscFunctionReturn(PETSC_SUCCESS);
4038 }
4039 
4040 /*@
4041   MatMPIAIJSetPreallocation - Preallocates memory for a sparse parallel matrix in `MATMPIAIJ` format
4042   (the default parallel PETSc format).  For good matrix assembly performance
4043   the user should preallocate the matrix storage by setting the parameters
4044   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4045 
4046   Collective
4047 
4048   Input Parameters:
4049 + B     - the matrix
4050 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4051            (same value is used for all local rows)
4052 . d_nnz - array containing the number of nonzeros in the various rows of the
4053            DIAGONAL portion of the local submatrix (possibly different for each row)
4054            or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `d_nz` is used to specify the nonzero structure.
           The size of this array is equal to the number of local rows, i.e. 'm'.
4056            For matrices that will be factored, you must leave room for (and set)
4057            the diagonal entry even if it is zero.
4058 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4059            submatrix (same value is used for all local rows).
4060 - o_nnz - array containing the number of nonzeros in the various rows of the
4061            OFF-DIAGONAL portion of the local submatrix (possibly different for
4062            each row) or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `o_nz` is used to specify the nonzero
           structure. The size of this array is equal to the number
           of local rows, i.e. 'm'.
4065 
4066   Example Usage:
4067   Consider the following 8x8 matrix with 34 non-zero values, that is
4068   assembled across 3 processors. Lets assume that proc0 owns 3 rows,
4069   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4070   as follows
4071 
4072 .vb
4073             1  2  0  |  0  3  0  |  0  4
4074     Proc0   0  5  6  |  7  0  0  |  8  0
4075             9  0 10  | 11  0  0  | 12  0
4076     -------------------------------------
4077            13  0 14  | 15 16 17  |  0  0
4078     Proc1   0 18  0  | 19 20 21  |  0  0
4079             0  0  0  | 22 23  0  | 24  0
4080     -------------------------------------
4081     Proc2  25 26 27  |  0  0 28  | 29  0
4082            30  0  0  | 31 32 33  |  0 34
4083 .ve
4084 
4085   This can be represented as a collection of submatrices as
4086 .vb
4087       A B C
4088       D E F
4089       G H I
4090 .ve
4091 
4092   Where the submatrices A,B,C are owned by proc0, D,E,F are
4093   owned by proc1, G,H,I are owned by proc2.
4094 
4095   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4096   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4097   The 'M','N' parameters are 8,8, and have the same values on all procs.
4098 
4099   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4100   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4101   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4102   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4103   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
  matrix, and [DF] as another `MATSEQAIJ` matrix.
4105 
4106   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4107   allocated for every row of the local diagonal submatrix, and `o_nz`
4108   storage locations are allocated for every row of the OFF-DIAGONAL submat.
  One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
  the local rows for each of the local DIAGONAL and OFF-DIAGONAL submatrices.
4111   In this case, the values of `d_nz`, `o_nz` are
4112 .vb
4113      proc0  dnz = 2, o_nz = 2
4114      proc1  dnz = 3, o_nz = 2
4115      proc2  dnz = 1, o_nz = 4
4116 .ve
  We are allocating `m`*(`d_nz`+`o_nz`) storage locations for every proc. This
  translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, and 2*(1+4)=10
  for proc2, i.e. we are using 12+15+10=37 storage locations to store
  34 values.
4121 
4122   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4123   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4124   In the above case the values for `d_nnz`, `o_nnz` are
4125 .vb
4126      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4127      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4128      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4129 .ve
  Here the space allocated is the sum of all the above values, i.e. 34, and
  hence the preallocation is perfect.
4132 
4133   Level: intermediate
4134 
4135   Notes:
4136   If the *_nnz parameter is given then the *_nz parameter is ignored
4137 
4138   The `MATAIJ` format, also called compressed row storage (CSR), is compatible with standard Fortran
4139   storage.  The stored row and column indices begin with zero.
4140   See [Sparse Matrices](sec_matsparse) for details.
4141 
  The parallel matrix is partitioned such that the first m0 rows belong to
  process 0, the next m1 rows belong to process 1, the next m2 rows belong
  to process 2, etc., where m0, m1, m2, ... are the values of the input parameter 'm' on each process.
4145 
  The DIAGONAL portion of the local submatrix of a processor can be defined
  as the submatrix which is obtained by extracting the part corresponding to
  the rows r1-r2 and columns c1-c2 of the global matrix, where r1 is the
  first row that belongs to the processor, r2 is the last row belonging to
  this processor, and c1-c2 is the range of indices of the local part of a
  vector suitable for applying the matrix to.  This is an mxn matrix.  In the
  common case of a square matrix, the row and column ranges are the same and
  the DIAGONAL part is also square. The remaining portion of the local
  submatrix (m x (N-n)) constitutes the OFF-DIAGONAL portion.
4155 
  You can call `MatGetInfo()` to get information on how effective the preallocation was;
  for example, the fields mallocs, nz_allocated, nz_used, and nz_unneeded.
4160   You can also run with the option `-info` and look for messages with the string
4161   malloc in them to see if additional memory allocation was needed.
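
  As a concrete sketch for the 8x8 example above (the `d_nnz` and `o_nnz` values are
  the ones listed for proc0, and `comm` is assumed to have 3 ranks), rank 0 would do
  roughly:
.vb
  Mat      A;
  PetscInt d_nnz[] = {2, 2, 2}; /* DIAGONAL row lengths on rank 0     */
  PetscInt o_nnz[] = {2, 2, 2}; /* OFF-DIAGONAL row lengths on rank 0 */

  MatCreate(comm, &A);
  MatSetSizes(A, 3, 3, 8, 8);
  MatSetType(A, MATMPIAIJ);
  MatMPIAIJSetPreallocation(A, 0, d_nnz, 0, o_nnz);
.ve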
4162 
4163 .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MATMPIAIJ`, `MATAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateAIJ()`, `MatMPIAIJSetPreallocationCSR()`,
4164           `MatGetInfo()`, `PetscSplitOwnership()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4165 @*/
4166 PetscErrorCode MatMPIAIJSetPreallocation(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
4167 {
4168   PetscFunctionBegin;
4169   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
4170   PetscValidType(B, 1);
4171   PetscTryMethod(B, "MatMPIAIJSetPreallocation_C", (Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[]), (B, d_nz, d_nnz, o_nz, o_nnz));
4172   PetscFunctionReturn(PETSC_SUCCESS);
4173 }
4174 
4175 /*@
  MatCreateMPIAIJWithArrays - creates a `MATMPIAIJ` matrix using arrays that contain the local rows in standard
  CSR format.
4178 
4179   Collective
4180 
4181   Input Parameters:
4182 + comm - MPI communicator
4183 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
. n    - This value should be the same as the local size used in creating the
         x vector for the matrix-vector product $y = Ax$ (or `PETSC_DECIDE` to have it
         calculated if `N` is given). For square matrices `n` is almost always `m`.
4187 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
4188 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
4189 . i    - row indices (of length m+1); that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
4190 . j    - global column indices
4191 - a    - optional matrix values
4192 
4193   Output Parameter:
4194 . mat - the matrix
4195 
4196   Level: intermediate
4197 
4198   Notes:
4199   The `i`, `j`, and `a` arrays ARE copied by this routine into the internal format used by PETSc;
4200   thus you CANNOT change the matrix entries by changing the values of `a[]` after you have
4201   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
4202 
4203   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
4204 
  Once you have created the matrix you can update it with new numerical values using `MatUpdateMPIAIJWithArray()`.

  If you do **not** use `MatUpdateMPIAIJWithArray()`, the column indices in `j` do not need to be sorted. If you will use
  it, the column indices **must** be sorted.
4209 
4210   The format which is used for the sparse matrix input, is equivalent to a
4211   row-major ordering, i.e., for the following matrix, the input data expected is
4212   as shown
4213 .vb
4214         1 0 0
4215         2 0 3     P0
4216        -------
4217         4 5 6     P1
4218 
4219      Process0 [P0] rows_owned=[0,1]
4220         i =  {0,1,3}  [size = nrow+1  = 2+1]
4221         j =  {0,0,2}  [size = 3]
4222         v =  {1,2,3}  [size = 3]
4223 
4224      Process1 [P1] rows_owned=[2]
4225         i =  {0,3}    [size = nrow+1  = 1+1]
4226         j =  {0,1,2}  [size = 3]
4227         v =  {4,5,6}  [size = 3]
4228 .ve
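
  On process 0 of the example above, the call would look roughly like this (a
  minimal sketch; `comm` is assumed to have 2 ranks and error checking is omitted):
.vb
  Mat         A;
  PetscInt    i[] = {0, 1, 3};
  PetscInt    j[] = {0, 0, 2};
  PetscScalar v[] = {1.0, 2.0, 3.0};

  MatCreateMPIAIJWithArrays(comm, 2, PETSC_DECIDE, PETSC_DETERMINE, 3, i, j, v, &A);
.ve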
4229 
4230 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4231           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4232 @*/
4233 PetscErrorCode MatCreateMPIAIJWithArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt i[], const PetscInt j[], const PetscScalar a[], Mat *mat)
4234 {
4235   PetscFunctionBegin;
4236   PetscCheck(!i || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4237   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4238   PetscCall(MatCreate(comm, mat));
4239   PetscCall(MatSetSizes(*mat, m, n, M, N));
4240   /* PetscCall(MatSetBlockSizes(M,bs,cbs)); */
4241   PetscCall(MatSetType(*mat, MATMPIAIJ));
4242   PetscCall(MatMPIAIJSetPreallocationCSR(*mat, i, j, a));
4243   PetscFunctionReturn(PETSC_SUCCESS);
4244 }
4245 
4246 /*@
  MatUpdateMPIAIJWithArrays - updates a `MATMPIAIJ` matrix using arrays that contain the local rows in standard
  CSR format. Only the numerical values are updated; the other arrays must be identical to what was passed
  to `MatCreateMPIAIJWithArrays()`
4250 
4251   Deprecated: Use `MatUpdateMPIAIJWithArray()`
4252 
4253   Collective
4254 
4255   Input Parameters:
4256 + mat - the matrix
4257 . m   - number of local rows (Cannot be `PETSC_DECIDE`)
. n   - This value should be the same as the local size used in creating the
       x vector for the matrix-vector product y = Ax (or `PETSC_DECIDE` to have it
       calculated if N is given). For square matrices n is almost always m.
4261 . M   - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4262 . N   - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4263 . Ii  - row indices; that is Ii[0] = 0, Ii[row] = Ii[row-1] + number of elements in that row of the matrix
4264 . J   - column indices
4265 - v   - matrix values
4266 
4267   Level: deprecated
4268 
4269 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4270           `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4271 @*/
4272 PetscErrorCode MatUpdateMPIAIJWithArrays(Mat mat, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
4273 {
4274   PetscInt        nnz, i;
4275   PetscBool       nooffprocentries;
4276   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4277   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4278   PetscScalar    *ad, *ao;
4279   PetscInt        ldi, Iii, md;
4280   const PetscInt *Adi = Ad->i;
4281   PetscInt       *ld  = Aij->ld;
4282 
4283   PetscFunctionBegin;
4284   PetscCheck(Ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4285   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4286   PetscCheck(m == mat->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of rows cannot change from call to MatUpdateMPIAIJWithArrays()");
4287   PetscCheck(n == mat->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of columns cannot change from call to MatUpdateMPIAIJWithArrays()");
4288 
4289   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4290   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4291 
4292   for (i = 0; i < m; i++) {
4293     if (PetscDefined(USE_DEBUG)) {
4294       for (PetscInt j = Ii[i] + 1; j < Ii[i + 1]; ++j) {
4295         PetscCheck(J[j] >= J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is not sorted", j - Ii[i], J[j], i);
4296         PetscCheck(J[j] != J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is identical to previous entry", j - Ii[i], J[j], i);
4297       }
4298     }
4299     nnz = Ii[i + 1] - Ii[i];
4300     Iii = Ii[i];
4301     ldi = ld[i];
4302     md  = Adi[i + 1] - Adi[i];
    /* each CSR row of v is laid out as [ld[i] off-diagonal entries left of the diagonal block | md diagonal-block entries | remaining off-diagonal entries] */
    PetscCall(PetscArraycpy(ao, v + Iii, ldi));
    PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
    PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4306     ad += md;
4307     ao += nnz - md;
4308   }
4309   nooffprocentries      = mat->nooffprocentries;
4310   mat->nooffprocentries = PETSC_TRUE;
4311   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4312   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4313   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4314   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4315   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4316   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4317   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4318   mat->nooffprocentries = nooffprocentries;
4319   PetscFunctionReturn(PETSC_SUCCESS);
4320 }
4321 
4322 /*@
  MatUpdateMPIAIJWithArray - updates a `MATMPIAIJ` matrix using an array that contains the nonzero values
4324 
4325   Collective
4326 
4327   Input Parameters:
4328 + mat - the matrix
4329 - v   - matrix values, stored by row
4330 
4331   Level: intermediate
4332 
4333   Notes:
4334   The matrix must have been obtained with `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()`
4335 
4336   The column indices in the call to `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()` must have been sorted for this call to work correctly
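
  A typical reuse pattern is then roughly as follows (a minimal sketch; `comm`, `A`, and
  the CSR data `m`, `n`, `M`, `N`, `i`, `j`, `v` with sorted column indices in each row are
  assumed from the creation call):
.vb
  MatCreateMPIAIJWithArrays(comm, m, n, M, N, i, j, v, &A);
  /* ... later, after recomputing only the numerical values in v ... */
  MatUpdateMPIAIJWithArray(A, v);
.ve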
4337 
4338 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4339           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4340 @*/
4341 PetscErrorCode MatUpdateMPIAIJWithArray(Mat mat, const PetscScalar v[])
4342 {
4343   PetscInt        nnz, i, m;
4344   PetscBool       nooffprocentries;
4345   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4346   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4347   Mat_SeqAIJ     *Ao  = (Mat_SeqAIJ *)Aij->B->data;
4348   PetscScalar    *ad, *ao;
  const PetscInt *Adi = Ad->i, *Aoi = Ao->i;
4350   PetscInt        ldi, Iii, md;
4351   PetscInt       *ld = Aij->ld;
4352 
4353   PetscFunctionBegin;
4354   m = mat->rmap->n;
4355 
4356   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4357   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4358   Iii = 0;
4359   for (i = 0; i < m; i++) {
    nnz = Adi[i + 1] - Adi[i] + Aoi[i + 1] - Aoi[i];
4361     ldi = ld[i];
4362     md  = Adi[i + 1] - Adi[i];
4363     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4364     ad += md;
4365     if (ao) {
4366       PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4367       PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4368       ao += nnz - md;
4369     }
4370     Iii += nnz;
4371   }
4372   nooffprocentries      = mat->nooffprocentries;
4373   mat->nooffprocentries = PETSC_TRUE;
4374   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4375   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4376   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4377   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4378   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4379   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4380   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4381   mat->nooffprocentries = nooffprocentries;
4382   PetscFunctionReturn(PETSC_SUCCESS);
4383 }
4384 
4385 /*@
4386   MatCreateAIJ - Creates a sparse parallel matrix in `MATAIJ` format
4387   (the default parallel PETSc format).  For good matrix assembly performance
4388   the user should preallocate the matrix storage by setting the parameters
4389   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4390 
4391   Collective
4392 
4393   Input Parameters:
4394 + comm  - MPI communicator
4395 . m     - number of local rows (or `PETSC_DECIDE` to have calculated if M is given)
4396           This value should be the same as the local size used in creating the
4397           y vector for the matrix-vector product y = Ax.
. n     - This value should be the same as the local size used in creating the
          x vector for the matrix-vector product y = Ax (or `PETSC_DECIDE` to have it
          calculated if N is given). For square matrices n is almost always m.
4401 . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4402 . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4403 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4404           (same value is used for all local rows)
4405 . d_nnz - array containing the number of nonzeros in the various rows of the
4406           DIAGONAL portion of the local submatrix (possibly different for each row)
4407           or `NULL`, if `d_nz` is used to specify the nonzero structure.
          The size of this array is equal to the number of local rows, i.e. 'm'.
4409 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4410           submatrix (same value is used for all local rows).
4411 - o_nnz - array containing the number of nonzeros in the various rows of the
4412           OFF-DIAGONAL portion of the local submatrix (possibly different for
4413           each row) or `NULL`, if `o_nz` is used to specify the nonzero
          structure. The size of this array is equal to the number
          of local rows, i.e. 'm'.
4416 
4417   Output Parameter:
4418 . A - the matrix
4419 
4420   Options Database Keys:
4421 + -mat_no_inode                     - Do not use inodes
4422 . -mat_inode_limit <limit>          - Sets inode limit (max limit=5)
4423 - -matmult_vecscatter_view <viewer> - View the vecscatter (i.e., communication pattern) used in `MatMult()` of sparse parallel matrices.
4424                                       See viewer types in manual of `MatView()`. Of them, ascii_matlab, draw or binary cause the `VecScatter`
4425                                       to be viewed as a matrix. Entry (i,j) is the size of message (in bytes) rank i sends to rank j in one `MatMult()` call.
4426 
4427   Level: intermediate
4428 
4429   Notes:
4430   It is recommended that one use `MatCreateFromOptions()` or the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
4431   MatXXXXSetPreallocation() paradigm instead of this routine directly.
4432   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
4433 
4434   If the *_nnz parameter is given then the *_nz parameter is ignored
4435 
4436   The `m`,`n`,`M`,`N` parameters specify the size of the matrix, and its partitioning across
4437   processors, while `d_nz`,`d_nnz`,`o_nz`,`o_nnz` parameters specify the approximate
4438   storage requirements for this matrix.
4439 
  If `PETSC_DECIDE` or `PETSC_DETERMINE` is used for a particular argument on one
  processor then it must be used on all processors that share the object for
4442   that argument.
4443 
4444   If `m` and `n` are not `PETSC_DECIDE`, then the values determine the `PetscLayout` of the matrix and the ranges returned by
4445   `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`, and `MatGetOwnershipRangesColumn()`.
4446 
4447   The user MUST specify either the local or global matrix dimensions
4448   (possibly both).
4449 
4450   The parallel matrix is partitioned across processors such that the
4451   first `m0` rows belong to process 0, the next `m1` rows belong to
4452   process 1, the next `m2` rows belong to process 2, etc., where
  `m0`, `m1`, `m2`, ... are the values of the input parameter `m` on each MPI process. I.e., each MPI process stores
  values corresponding to an [m x N] submatrix.
4455 
  The columns are logically partitioned with the first n0 columns belonging
  to the 0th partition, the next n1 columns belonging to the next
  partition, etc., where n0, n1, n2, ... are the values of the input parameter 'n' on each process.
4459 
  The DIAGONAL portion of the local submatrix on any given processor
  is the submatrix formed by the m rows and n columns owned by
  that processor, i.e., the diagonal matrix on
  process 0 is [m0 x n0], the diagonal matrix on process 1 is [m1 x n1],
  etc. The remaining portion of the local submatrix [m x (N-n)]
  constitutes the OFF-DIAGONAL portion. The example below
  illustrates this concept better.
4467 
4468   For a square global matrix we define each processor's diagonal portion
4469   to be its local rows and the corresponding columns (a square submatrix);
4470   each processor's off-diagonal portion encompasses the remainder of the
4471   local matrix (a rectangular submatrix).
4472 
4475   When calling this routine with a single process communicator, a matrix of
4476   type `MATSEQAIJ` is returned.  If a matrix of type `MATMPIAIJ` is desired for this
4477   type of communicator, use the construction mechanism
4478 .vb
4479   MatCreate(..., &A);
4480   MatSetType(A, MATMPIAIJ);
4481   MatSetSizes(A, m, n, M, N);
4482   MatMPIAIJSetPreallocation(A, ...);
4483 .ve
4484 
4485   By default, this format uses inodes (identical nodes) when possible.
4486   We search for consecutive rows with the same nonzero structure, thereby
4487   reusing matrix information to achieve increased efficiency.
4488 
4489   Example Usage:
4490   Consider the following 8x8 matrix with 34 non-zero values, that is
4491   assembled across 3 processors. Lets assume that proc0 owns 3 rows,
4492   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4493   as follows
4494 
4495 .vb
4496             1  2  0  |  0  3  0  |  0  4
4497     Proc0   0  5  6  |  7  0  0  |  8  0
4498             9  0 10  | 11  0  0  | 12  0
4499     -------------------------------------
4500            13  0 14  | 15 16 17  |  0  0
4501     Proc1   0 18  0  | 19 20 21  |  0  0
4502             0  0  0  | 22 23  0  | 24  0
4503     -------------------------------------
4504     Proc2  25 26 27  |  0  0 28  | 29  0
4505            30  0  0  | 31 32 33  |  0 34
4506 .ve
4507 
4508   This can be represented as a collection of submatrices as
4509 
4510 .vb
4511       A B C
4512       D E F
4513       G H I
4514 .ve
4515 
4516   Where the submatrices A,B,C are owned by proc0, D,E,F are
4517   owned by proc1, G,H,I are owned by proc2.
4518 
4519   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4520   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4521   The 'M','N' parameters are 8,8, and have the same values on all procs.
4522 
4523   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4524   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4525   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4526   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4527   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
  matrix, and [DF] as another `MATSEQAIJ` matrix.
4529 
4530   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4531   allocated for every row of the local diagonal submatrix, and `o_nz`
4532   storage locations are allocated for every row of the OFF-DIAGONAL submat.
  One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
  the local rows for each of the local DIAGONAL and OFF-DIAGONAL submatrices.
4535   In this case, the values of `d_nz`,`o_nz` are
4536 .vb
4537      proc0  dnz = 2, o_nz = 2
4538      proc1  dnz = 3, o_nz = 2
4539      proc2  dnz = 1, o_nz = 4
4540 .ve
  We are allocating `m`*(`d_nz`+`o_nz`) storage locations for every proc. This
  translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, and 2*(1+4)=10
  for proc2, i.e. we are using 12+15+10=37 storage locations to store
  34 values.
4545 
4546   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4547   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
  In the above case the values for `d_nnz`, `o_nnz` are
4549 .vb
4550      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4551      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4552      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4553 .ve
  Here the space allocated is the sum of all the above values, i.e. 34, and
  hence the preallocation is perfect.
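
  Putting the worked example together, rank 0 would create the matrix roughly as
  follows (a minimal sketch using the `d_nnz` and `o_nnz` values listed above for proc0):
.vb
  Mat      A;
  PetscInt d_nnz[] = {2, 2, 2};
  PetscInt o_nnz[] = {2, 2, 2};

  MatCreateAIJ(comm, 3, 3, 8, 8, 0, d_nnz, 0, o_nnz, &A);
.ve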
4556 
4557 .seealso: [](ch_matrices), `Mat`, [Sparse Matrix Creation](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4558           `MATMPIAIJ`, `MatCreateMPIAIJWithArrays()`, `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`,
4559           `MatGetOwnershipRangesColumn()`, `PetscLayout`
4560 @*/
4561 PetscErrorCode MatCreateAIJ(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
4562 {
4563   PetscMPIInt size;
4564 
4565   PetscFunctionBegin;
4566   PetscCall(MatCreate(comm, A));
4567   PetscCall(MatSetSizes(*A, m, n, M, N));
4568   PetscCallMPI(MPI_Comm_size(comm, &size));
4569   if (size > 1) {
4570     PetscCall(MatSetType(*A, MATMPIAIJ));
4571     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
4572   } else {
4573     PetscCall(MatSetType(*A, MATSEQAIJ));
4574     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
4575   }
4576   PetscFunctionReturn(PETSC_SUCCESS);
4577 }
4578 
4579 /*MC
4580     MatMPIAIJGetSeqAIJF90 - Returns the local pieces of this distributed matrix
4581 
4582     Synopsis:
4583     MatMPIAIJGetSeqAIJF90(Mat A, Mat Ad, Mat Ao, {PetscInt, pointer :: colmap(:)},integer ierr)
4584 
4585     Not Collective
4586 
4587     Input Parameter:
4588 .   A - the `MATMPIAIJ` matrix
4589 
4590     Output Parameters:
4591 +   Ad - the diagonal portion of the matrix
4592 .   Ao - the off-diagonal portion of the matrix
4593 .   colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4594 -   ierr - error code
4595 
4596      Level: advanced
4597 
4598     Note:
    Use `MatMPIAIJRestoreSeqAIJF90()` when you no longer need access to the matrices and `colmap`
4600 
4601 .seealso: [](ch_matrices), `Mat`, [](sec_fortranarrays), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJ()`, `MatMPIAIJRestoreSeqAIJF90()`
4602 M*/
4603 
4604 /*MC
4605     MatMPIAIJRestoreSeqAIJF90 - call after `MatMPIAIJGetSeqAIJF90()` when you no longer need access to the matrices and `colmap`
4606 
4607     Synopsis:
4608     MatMPIAIJRestoreSeqAIJF90(Mat A, Mat Ad, Mat Ao, {PetscInt, pointer :: colmap(:)},integer ierr)
4609 
4610     Not Collective
4611 
4612     Input Parameters:
4613 +   A - the `MATMPIAIJ` matrix
4614 .   Ad - the diagonal portion of the matrix
4615 .   Ao - the off-diagonal portion of the matrix
4616 .   colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4617 -   ierr - error code
4618 
4619      Level: advanced
4620 
4621 .seealso: [](ch_matrices), `Mat`, [](sec_fortranarrays), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJ()`, `MatMPIAIJGetSeqAIJF90()`
4622 M*/
4623 
4624 /*@C
4625   MatMPIAIJGetSeqAIJ - Returns the local pieces of this distributed matrix
4626 
4627   Not Collective
4628 
4629   Input Parameter:
4630 . A - The `MATMPIAIJ` matrix
4631 
4632   Output Parameters:
4633 + Ad     - The local diagonal block as a `MATSEQAIJ` matrix
4634 . Ao     - The local off-diagonal block as a `MATSEQAIJ` matrix
4635 - colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4636 
4637   Level: intermediate
4638 
4639   Note:
4640   The rows in `Ad` and `Ao` are in [0, Nr), where Nr is the number of local rows on this process. The columns
  in `Ad` are in [0, Nc), where Nc is the number of local columns. The columns of `Ao` are in [0, Nco), where Nco is
4642   the number of nonzero columns in the local off-diagonal piece of the matrix `A`. The array colmap maps these
4643   local column numbers to global column numbers in the original matrix.
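
  For instance, to translate a column of `Ao` back to a global column of `A` (a
  minimal sketch):
.vb
  Mat             Ad, Ao;
  const PetscInt *colmap;

  MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &colmap);
  /* column c of Ao corresponds to global column colmap[c] of A */
.ve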
4644 
4645   Fortran Notes:
4646   `MatMPIAIJGetSeqAIJ()` Fortran binding is deprecated (since PETSc 3.19), use `MatMPIAIJGetSeqAIJF90()`
4647 
4648 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetSeqAIJF90()`, `MatMPIAIJRestoreSeqAIJF90()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`, `MatCreateAIJ()`, `MATSEQAIJ`
4649 @*/
4650 PetscErrorCode MatMPIAIJGetSeqAIJ(Mat A, Mat *Ad, Mat *Ao, const PetscInt *colmap[])
4651 {
4652   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
4653   PetscBool   flg;
4654 
4655   PetscFunctionBegin;
4656   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &flg));
4657   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "This function requires a MATMPIAIJ matrix as input");
4658   if (Ad) *Ad = a->A;
4659   if (Ao) *Ao = a->B;
4660   if (colmap) *colmap = a->garray;
4661   PetscFunctionReturn(PETSC_SUCCESS);
4662 }
4663 
4664 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_MPIAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
4665 {
4666   PetscInt     m, N, i, rstart, nnz, Ii;
4667   PetscInt    *indx;
4668   PetscScalar *values;
4669   MatType      rootType;
4670 
4671   PetscFunctionBegin;
4672   PetscCall(MatGetSize(inmat, &m, &N));
4673   if (scall == MAT_INITIAL_MATRIX) { /* symbolic phase */
4674     PetscInt *dnz, *onz, sum, bs, cbs;
4675 
4676     if (n == PETSC_DECIDE) PetscCall(PetscSplitOwnership(comm, &n, &N));
4677     /* Check sum(n) = N */
4678     PetscCallMPI(MPIU_Allreduce(&n, &sum, 1, MPIU_INT, MPI_SUM, comm));
4679     PetscCheck(sum == N, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Sum of local columns %" PetscInt_FMT " != global columns %" PetscInt_FMT, sum, N);
4680 
4681     PetscCallMPI(MPI_Scan(&m, &rstart, 1, MPIU_INT, MPI_SUM, comm));
4682     rstart -= m;
4683 
4684     MatPreallocateBegin(comm, m, n, dnz, onz);
4685     for (i = 0; i < m; i++) {
4686       PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4687       PetscCall(MatPreallocateSet(i + rstart, nnz, indx, dnz, onz));
4688       PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4689     }
4690 
4691     PetscCall(MatCreate(comm, outmat));
4692     PetscCall(MatSetSizes(*outmat, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
4693     PetscCall(MatGetBlockSizes(inmat, &bs, &cbs));
4694     PetscCall(MatSetBlockSizes(*outmat, bs, cbs));
4695     PetscCall(MatGetRootType_Private(inmat, &rootType));
4696     PetscCall(MatSetType(*outmat, rootType));
4697     PetscCall(MatSeqAIJSetPreallocation(*outmat, 0, dnz));
4698     PetscCall(MatMPIAIJSetPreallocation(*outmat, 0, dnz, 0, onz));
4699     MatPreallocateEnd(dnz, onz);
4700     PetscCall(MatSetOption(*outmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
4701   }
4702 
4703   /* numeric phase */
4704   PetscCall(MatGetOwnershipRange(*outmat, &rstart, NULL));
4705   for (i = 0; i < m; i++) {
4706     PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4707     Ii = i + rstart;
4708     PetscCall(MatSetValues(*outmat, 1, &Ii, nnz, indx, values, INSERT_VALUES));
4709     PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4710   }
4711   PetscCall(MatAssemblyBegin(*outmat, MAT_FINAL_ASSEMBLY));
4712   PetscCall(MatAssemblyEnd(*outmat, MAT_FINAL_ASSEMBLY));
4713   PetscFunctionReturn(PETSC_SUCCESS);
4714 }
4715 
4716 static PetscErrorCode MatDestroy_MPIAIJ_SeqsToMPI(void **data)
4717 {
4718   Mat_Merge_SeqsToMPI *merge = (Mat_Merge_SeqsToMPI *)*data;
4719 
4720   PetscFunctionBegin;
4721   if (!merge) PetscFunctionReturn(PETSC_SUCCESS);
4722   PetscCall(PetscFree(merge->id_r));
4723   PetscCall(PetscFree(merge->len_s));
4724   PetscCall(PetscFree(merge->len_r));
4725   PetscCall(PetscFree(merge->bi));
4726   PetscCall(PetscFree(merge->bj));
4727   PetscCall(PetscFree(merge->buf_ri[0]));
4728   PetscCall(PetscFree(merge->buf_ri));
4729   PetscCall(PetscFree(merge->buf_rj[0]));
4730   PetscCall(PetscFree(merge->buf_rj));
4731   PetscCall(PetscFree(merge->coi));
4732   PetscCall(PetscFree(merge->coj));
4733   PetscCall(PetscFree(merge->owners_co));
4734   PetscCall(PetscLayoutDestroy(&merge->rowmap));
4735   PetscCall(PetscFree(merge));
4736   PetscFunctionReturn(PETSC_SUCCESS);
4737 }
4738 
4739 #include <../src/mat/utils/freespace.h>
4740 #include <petscbt.h>
4741 
4742 PetscErrorCode MatCreateMPIAIJSumSeqAIJNumeric(Mat seqmat, Mat mpimat)
4743 {
4744   MPI_Comm             comm;
4745   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
4746   PetscMPIInt          size, rank, taga, *len_s;
4747   PetscInt             N = mpimat->cmap->N, i, j, *owners, *ai = a->i, *aj, m;
4748   PetscMPIInt          proc, k;
4749   PetscInt           **buf_ri, **buf_rj;
4750   PetscInt             anzi, *bj_i, *bi, *bj, arow, bnzi, nextaj;
4751   PetscInt             nrows, **buf_ri_k, **nextrow, **nextai;
4752   MPI_Request         *s_waits, *r_waits;
4753   MPI_Status          *status;
4754   const MatScalar     *aa, *a_a;
4755   MatScalar          **abuf_r, *ba_i;
4756   Mat_Merge_SeqsToMPI *merge;
4757   PetscContainer       container;
4758 
4759   PetscFunctionBegin;
4760   PetscCall(PetscObjectGetComm((PetscObject)mpimat, &comm));
4761   PetscCall(PetscLogEventBegin(MAT_Seqstompinum, seqmat, 0, 0, 0));
4762 
4763   PetscCallMPI(MPI_Comm_size(comm, &size));
4764   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4765 
4766   PetscCall(PetscObjectQuery((PetscObject)mpimat, "MatMergeSeqsToMPI", (PetscObject *)&container));
4767   PetscCheck(container, PetscObjectComm((PetscObject)mpimat), PETSC_ERR_PLIB, "Mat not created from MatCreateMPIAIJSumSeqAIJSymbolic");
4768   PetscCall(PetscContainerGetPointer(container, (void **)&merge));
4769   PetscCall(MatSeqAIJGetArrayRead(seqmat, &a_a));
4770   aa = a_a;
4771 
4772   bi     = merge->bi;
4773   bj     = merge->bj;
4774   buf_ri = merge->buf_ri;
4775   buf_rj = merge->buf_rj;
4776 
4777   PetscCall(PetscMalloc1(size, &status));
4778   owners = merge->rowmap->range;
4779   len_s  = merge->len_s;
4780 
4781   /* send and recv matrix values */
4782   PetscCall(PetscObjectGetNewTag((PetscObject)mpimat, &taga));
4783   PetscCall(PetscPostIrecvScalar(comm, taga, merge->nrecv, merge->id_r, merge->len_r, &abuf_r, &r_waits));
4784 
4785   PetscCall(PetscMalloc1(merge->nsend + 1, &s_waits));
4786   for (proc = 0, k = 0; proc < size; proc++) {
4787     if (!len_s[proc]) continue;
4788     i = owners[proc];
4789     PetscCallMPI(MPIU_Isend(aa + ai[i], len_s[proc], MPIU_MATSCALAR, proc, taga, comm, s_waits + k));
4790     k++;
4791   }
4792 
4793   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, r_waits, status));
4794   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, s_waits, status));
4795   PetscCall(PetscFree(status));
4796 
4797   PetscCall(PetscFree(s_waits));
4798   PetscCall(PetscFree(r_waits));
4799 
4800   /* insert mat values of mpimat */
4801   PetscCall(PetscMalloc1(N, &ba_i));
4802   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4803 
4804   for (k = 0; k < merge->nrecv; k++) {
4805     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4806     nrows       = *buf_ri_k[k];
4807     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4808     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4809   }
4810 
4811   /* set values of ba */
4812   m = merge->rowmap->n;
4813   for (i = 0; i < m; i++) {
4814     arow = owners[rank] + i;
4815     bj_i = bj + bi[i]; /* col indices of the i-th row of mpimat */
4816     bnzi = bi[i + 1] - bi[i];
4817     PetscCall(PetscArrayzero(ba_i, bnzi));
4818 
4819     /* add local non-zero vals of this proc's seqmat into ba */
4820     anzi   = ai[arow + 1] - ai[arow];
4821     aj     = a->j + ai[arow];
4822     aa     = a_a + ai[arow];
4823     nextaj = 0;
4824     for (j = 0; nextaj < anzi; j++) {
4825       if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4826         ba_i[j] += aa[nextaj++];
4827       }
4828     }
4829 
4830     /* add received vals into ba */
4831     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
4832       /* i-th row */
4833       if (i == *nextrow[k]) {
4834         anzi   = *(nextai[k] + 1) - *nextai[k];
4835         aj     = buf_rj[k] + *nextai[k];
4836         aa     = abuf_r[k] + *nextai[k];
4837         nextaj = 0;
4838         for (j = 0; nextaj < anzi; j++) {
4839           if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4840             ba_i[j] += aa[nextaj++];
4841           }
4842         }
4843         nextrow[k]++;
4844         nextai[k]++;
4845       }
4846     }
4847     PetscCall(MatSetValues(mpimat, 1, &arow, bnzi, bj_i, ba_i, INSERT_VALUES));
4848   }
4849   PetscCall(MatSeqAIJRestoreArrayRead(seqmat, &a_a));
4850   PetscCall(MatAssemblyBegin(mpimat, MAT_FINAL_ASSEMBLY));
4851   PetscCall(MatAssemblyEnd(mpimat, MAT_FINAL_ASSEMBLY));
4852 
4853   PetscCall(PetscFree(abuf_r[0]));
4854   PetscCall(PetscFree(abuf_r));
4855   PetscCall(PetscFree(ba_i));
4856   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
4857   PetscCall(PetscLogEventEnd(MAT_Seqstompinum, seqmat, 0, 0, 0));
4858   PetscFunctionReturn(PETSC_SUCCESS);
4859 }
4860 
4861 PetscErrorCode MatCreateMPIAIJSumSeqAIJSymbolic(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, Mat *mpimat)
4862 {
4863   Mat                  B_mpi;
4864   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
4865   PetscMPIInt          size, rank, tagi, tagj, *len_s, *len_si, *len_ri;
4866   PetscInt           **buf_rj, **buf_ri, **buf_ri_k;
4867   PetscInt             M = seqmat->rmap->n, N = seqmat->cmap->n, i, *owners, *ai = a->i, *aj = a->j;
4868   PetscInt             len, *dnz, *onz, bs, cbs;
4869   PetscInt             k, anzi, *bi, *bj, *lnk, nlnk, arow, bnzi;
4870   PetscInt             nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextai;
4871   MPI_Request         *si_waits, *sj_waits, *ri_waits, *rj_waits;
4872   MPI_Status          *status;
4873   PetscFreeSpaceList   free_space = NULL, current_space = NULL;
4874   PetscBT              lnkbt;
4875   Mat_Merge_SeqsToMPI *merge;
4876   PetscContainer       container;
4877 
4878   PetscFunctionBegin;
4879   PetscCall(PetscLogEventBegin(MAT_Seqstompisym, seqmat, 0, 0, 0));
4880 
4881   /* make sure it is a PETSc comm */
4882   PetscCall(PetscCommDuplicate(comm, &comm, NULL));
4883   PetscCallMPI(MPI_Comm_size(comm, &size));
4884   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4885 
4886   PetscCall(PetscNew(&merge));
4887   PetscCall(PetscMalloc1(size, &status));
4888 
4889   /* determine row ownership */
4890   PetscCall(PetscLayoutCreate(comm, &merge->rowmap));
4891   PetscCall(PetscLayoutSetLocalSize(merge->rowmap, m));
4892   PetscCall(PetscLayoutSetSize(merge->rowmap, M));
4893   PetscCall(PetscLayoutSetBlockSize(merge->rowmap, 1));
4894   PetscCall(PetscLayoutSetUp(merge->rowmap));
4895   PetscCall(PetscMalloc1(size, &len_si));
4896   PetscCall(PetscMalloc1(size, &merge->len_s));
4897 
4898   m      = merge->rowmap->n;
4899   owners = merge->rowmap->range;
4900 
4901   /* determine the number of messages to send, their lengths */
4902   len_s = merge->len_s;
4903 
4904   len          = 0; /* length of buf_si[] */
4905   merge->nsend = 0;
4906   for (PetscMPIInt proc = 0; proc < size; proc++) {
4907     len_si[proc] = 0;
4908     if (proc == rank) {
4909       len_s[proc] = 0;
4910     } else {
4911       PetscCall(PetscMPIIntCast(owners[proc + 1] - owners[proc] + 1, &len_si[proc]));
4912       PetscCall(PetscMPIIntCast(ai[owners[proc + 1]] - ai[owners[proc]], &len_s[proc])); /* num of rows to be sent to [proc] */
4913     }
4914     if (len_s[proc]) {
4915       merge->nsend++;
4916       nrows = 0;
4917       for (i = owners[proc]; i < owners[proc + 1]; i++) {
4918         if (ai[i + 1] > ai[i]) nrows++;
4919       }
4920       PetscCall(PetscMPIIntCast(2 * (nrows + 1), &len_si[proc]));
4921       len += len_si[proc];
4922     }
4923   }
4924 
4925   /* determine the number and length of messages to receive for ij-structure */
4926   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &merge->nrecv));
4927   PetscCall(PetscGatherMessageLengths2(comm, merge->nsend, merge->nrecv, len_s, len_si, &merge->id_r, &merge->len_r, &len_ri));
4928 
4929   /* post the Irecv of j-structure */
4930   PetscCall(PetscCommGetNewTag(comm, &tagj));
4931   PetscCall(PetscPostIrecvInt(comm, tagj, merge->nrecv, merge->id_r, merge->len_r, &buf_rj, &rj_waits));
4932 
4933   /* post the Isend of j-structure */
4934   PetscCall(PetscMalloc2(merge->nsend, &si_waits, merge->nsend, &sj_waits));
4935 
4936   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4937     if (!len_s[proc]) continue;
4938     i = owners[proc];
4939     PetscCallMPI(MPIU_Isend(aj + ai[i], len_s[proc], MPIU_INT, proc, tagj, comm, sj_waits + k));
4940     k++;
4941   }
4942 
4943   /* receives and sends of j-structure are complete */
4944   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, rj_waits, status));
4945   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, sj_waits, status));
4946 
4947   /* send and recv i-structure */
4948   PetscCall(PetscCommGetNewTag(comm, &tagi));
4949   PetscCall(PetscPostIrecvInt(comm, tagi, merge->nrecv, merge->id_r, len_ri, &buf_ri, &ri_waits));
4950 
4951   PetscCall(PetscMalloc1(len + 1, &buf_s));
4952   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
4953   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4954     if (!len_s[proc]) continue;
4955     /* form outgoing message for i-structure:
4956          buf_si[0]:                 nrows to be sent
4957                [1:nrows]:           row index (global)
4958                [nrows+1:2*nrows+1]: i-structure index
4959     */
4960     nrows       = len_si[proc] / 2 - 1;
4961     buf_si_i    = buf_si + nrows + 1;
4962     buf_si[0]   = nrows;
4963     buf_si_i[0] = 0;
4964     nrows       = 0;
4965     for (i = owners[proc]; i < owners[proc + 1]; i++) {
4966       anzi = ai[i + 1] - ai[i];
4967       if (anzi) {
4968         buf_si_i[nrows + 1] = buf_si_i[nrows] + anzi; /* i-structure */
4969         buf_si[nrows + 1]   = i - owners[proc];       /* local row index */
4970         nrows++;
4971       }
4972     }
4973     PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, si_waits + k));
4974     k++;
4975     buf_si += len_si[proc];
4976   }
4977 
4978   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, ri_waits, status));
4979   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, si_waits, status));
4980 
4981   PetscCall(PetscInfo(seqmat, "nsend: %d, nrecv: %d\n", merge->nsend, merge->nrecv));
4982   for (i = 0; i < merge->nrecv; i++) PetscCall(PetscInfo(seqmat, "recv len_ri=%d, len_rj=%d from [%d]\n", len_ri[i], merge->len_r[i], merge->id_r[i]));
4983 
4984   PetscCall(PetscFree(len_si));
4985   PetscCall(PetscFree(len_ri));
4986   PetscCall(PetscFree(rj_waits));
4987   PetscCall(PetscFree2(si_waits, sj_waits));
4988   PetscCall(PetscFree(ri_waits));
4989   PetscCall(PetscFree(buf_s));
4990   PetscCall(PetscFree(status));
4991 
4992   /* compute a local seq matrix in each processor */
4993   /* allocate bi array and free space for accumulating nonzero column info */
4994   PetscCall(PetscMalloc1(m + 1, &bi));
4995   bi[0] = 0;
4996 
4997   /* create and initialize a linked list */
4998   nlnk = N + 1;
4999   PetscCall(PetscLLCreate(N, N, nlnk, lnk, lnkbt));
5000 
5001   /* initial FreeSpace size is 2*(num of local nnz(seqmat)) */
5002   len = ai[owners[rank + 1]] - ai[owners[rank]];
5003   PetscCall(PetscFreeSpaceGet(PetscIntMultTruncate(2, len) + 1, &free_space));
5004 
5005   current_space = free_space;
5006 
5007   /* determine symbolic info for each local row */
5008   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
5009 
5010   for (k = 0; k < merge->nrecv; k++) {
5011     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
5012     nrows       = *buf_ri_k[k];
5013     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
5014     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
5015   }
5016 
5017   MatPreallocateBegin(comm, m, n, dnz, onz);
5018   len = 0;
5019   for (i = 0; i < m; i++) {
5020     bnzi = 0;
5021     /* add local non-zero cols of this proc's seqmat into lnk */
5022     arow = owners[rank] + i;
5023     anzi = ai[arow + 1] - ai[arow];
5024     aj   = a->j + ai[arow];
5025     PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
5026     bnzi += nlnk;
5027     /* add received col data into lnk */
5028     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
5029       if (i == *nextrow[k]) {            /* i-th row */
5030         anzi = *(nextai[k] + 1) - *nextai[k];
5031         aj   = buf_rj[k] + *nextai[k];
5032         PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
5033         bnzi += nlnk;
5034         nextrow[k]++;
5035         nextai[k]++;
5036       }
5037     }
5038     if (len < bnzi) len = bnzi; /* =max(bnzi) */
5039 
5040     /* if free space is not available, make more free space */
5041     if (current_space->local_remaining < bnzi) PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(bnzi, current_space->total_array_size), &current_space));
5042     /* copy data into free space, then initialize lnk */
5043     PetscCall(PetscLLClean(N, N, bnzi, lnk, current_space->array, lnkbt));
5044     PetscCall(MatPreallocateSet(i + owners[rank], bnzi, current_space->array, dnz, onz));
5045 
5046     current_space->array += bnzi;
5047     current_space->local_used += bnzi;
5048     current_space->local_remaining -= bnzi;
5049 
5050     bi[i + 1] = bi[i] + bnzi;
5051   }
5052 
5053   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
5054 
5055   PetscCall(PetscMalloc1(bi[m] + 1, &bj));
5056   PetscCall(PetscFreeSpaceContiguous(&free_space, bj));
5057   PetscCall(PetscLLDestroy(lnk, lnkbt));
5058 
5059   /* create symbolic parallel matrix B_mpi */
5060   PetscCall(MatGetBlockSizes(seqmat, &bs, &cbs));
5061   PetscCall(MatCreate(comm, &B_mpi));
5062   if (n == PETSC_DECIDE) {
5063     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, N));
5064   } else {
5065     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
5066   }
5067   PetscCall(MatSetBlockSizes(B_mpi, bs, cbs));
5068   PetscCall(MatSetType(B_mpi, MATMPIAIJ));
5069   PetscCall(MatMPIAIJSetPreallocation(B_mpi, 0, dnz, 0, onz));
5070   MatPreallocateEnd(dnz, onz);
5071   PetscCall(MatSetOption(B_mpi, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
5072 
5073   /* B_mpi is not ready for use - assembly will be done by MatCreateMPIAIJSumSeqAIJNumeric() */
5074   B_mpi->assembled = PETSC_FALSE;
5075   merge->bi        = bi;
5076   merge->bj        = bj;
5077   merge->buf_ri    = buf_ri;
5078   merge->buf_rj    = buf_rj;
5079   merge->coi       = NULL;
5080   merge->coj       = NULL;
5081   merge->owners_co = NULL;
5082 
5083   PetscCall(PetscCommDestroy(&comm));
5084 
5085   /* attach the supporting struct to B_mpi for reuse */
5086   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
5087   PetscCall(PetscContainerSetPointer(container, merge));
5088   PetscCall(PetscContainerSetCtxDestroy(container, MatDestroy_MPIAIJ_SeqsToMPI));
5089   PetscCall(PetscObjectCompose((PetscObject)B_mpi, "MatMergeSeqsToMPI", (PetscObject)container));
5090   PetscCall(PetscContainerDestroy(&container));
5091   *mpimat = B_mpi;
5092 
5093   PetscCall(PetscLogEventEnd(MAT_Seqstompisym, seqmat, 0, 0, 0));
5094   PetscFunctionReturn(PETSC_SUCCESS);
5095 }
5096 
5097 /*@
5098   MatCreateMPIAIJSumSeqAIJ - Creates a `MATMPIAIJ` matrix by adding sequential
5099   matrices from each processor
5100 
5101   Collective
5102 
5103   Input Parameters:
+ comm   - the communicator the parallel matrix will live on
. seqmat - the input sequential matrix, one per process
5106 . m      - number of local rows (or `PETSC_DECIDE`)
5107 . n      - number of local columns (or `PETSC_DECIDE`)
5108 - scall  - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5109 
5110   Output Parameter:
5111 . mpimat - the parallel matrix generated
5112 
5113   Level: advanced
5114 
  Note:
  The dimensions of the sequential matrix MUST be the same on every process.
  The input `seqmat` is stored in the container "MatMergeSeqsToMPI" attached to `mpimat`, and will be
  destroyed when `mpimat` is destroyed. Call `PetscObjectQuery()` to access `seqmat`.
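
  Example Usage:
  A minimal sketch, assuming each rank has already assembled a `MATSEQAIJ` matrix `seqmat` of identical dimensions (error handling abbreviated)
.vb
  Mat mpimat;

  PetscCall(MatCreateMPIAIJSumSeqAIJ(PETSC_COMM_WORLD, seqmat, PETSC_DECIDE, PETSC_DECIDE, MAT_INITIAL_MATRIX, &mpimat));
  /* ... change values (but not the nonzero pattern) of seqmat ... */
  PetscCall(MatCreateMPIAIJSumSeqAIJ(PETSC_COMM_WORLD, seqmat, PETSC_DECIDE, PETSC_DECIDE, MAT_REUSE_MATRIX, &mpimat));
.ve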
5119 
5120 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJ()`
5121 @*/
5122 PetscErrorCode MatCreateMPIAIJSumSeqAIJ(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, MatReuse scall, Mat *mpimat)
5123 {
5124   PetscMPIInt size;
5125 
5126   PetscFunctionBegin;
5127   PetscCallMPI(MPI_Comm_size(comm, &size));
5128   if (size == 1) {
5129     PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5130     if (scall == MAT_INITIAL_MATRIX) {
5131       PetscCall(MatDuplicate(seqmat, MAT_COPY_VALUES, mpimat));
5132     } else {
5133       PetscCall(MatCopy(seqmat, *mpimat, SAME_NONZERO_PATTERN));
5134     }
5135     PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5136     PetscFunctionReturn(PETSC_SUCCESS);
5137   }
5138   PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5139   if (scall == MAT_INITIAL_MATRIX) PetscCall(MatCreateMPIAIJSumSeqAIJSymbolic(comm, seqmat, m, n, mpimat));
5140   PetscCall(MatCreateMPIAIJSumSeqAIJNumeric(seqmat, *mpimat));
5141   PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5142   PetscFunctionReturn(PETSC_SUCCESS);
5143 }
5144 
5145 /*@
5146   MatAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATAIJ` matrix.
5147 
5148   Not Collective
5149 
5150   Input Parameter:
5151 . A - the matrix
5152 
5153   Output Parameter:
5154 . A_loc - the local sequential matrix generated
5155 
5156   Level: developer
5157 
  Notes:
  The matrix is created by taking `A`'s local rows and putting them into a sequential matrix
  with `mlocal` rows and `n` columns, where `mlocal` is the local row count obtained with
  `MatGetLocalSize()` and `n` is the global column count obtained with `MatGetSize()`.

  In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5164 
5165   For parallel matrices this creates an entirely new matrix. If the matrix is sequential it merely increases the reference count.
5166 
5167   Destroy the matrix with `MatDestroy()`
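
  Example Usage:
  A minimal sketch (works whether `A` is sequential or parallel)
.vb
  Mat A_loc;

  PetscCall(MatAIJGetLocalMat(A, &A_loc));
  /* ... use A_loc as a sequential matrix ... */
  PetscCall(MatDestroy(&A_loc));
.ve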
5168 
5169 .seealso: [](ch_matrices), `Mat`, `MatMPIAIJGetLocalMat()`
5170 @*/
5171 PetscErrorCode MatAIJGetLocalMat(Mat A, Mat *A_loc)
5172 {
5173   PetscBool mpi;
5174 
5175   PetscFunctionBegin;
5176   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &mpi));
5177   if (mpi) {
5178     PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, A_loc));
5179   } else {
5180     *A_loc = A;
5181     PetscCall(PetscObjectReference((PetscObject)*A_loc));
5182   }
5183   PetscFunctionReturn(PETSC_SUCCESS);
5184 }
5185 
5186 /*@
5187   MatMPIAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix.
5188 
5189   Not Collective
5190 
5191   Input Parameters:
5192 + A     - the matrix
5193 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5194 
5195   Output Parameter:
5196 . A_loc - the local sequential matrix generated
5197 
5198   Level: developer
5199 
  Notes:
  The matrix is created by taking all `A`'s local rows and putting them into a sequential
  matrix with `mlocal` rows and `n` columns. `mlocal` is the local row count obtained with
  `MatGetLocalSize()` and `n` is the global column count obtained with `MatGetSize()`.

  In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5206 
  When `A` lives on a single MPI process and `MAT_INITIAL_MATRIX` is requested, the matrix returned is the diagonal part of `A` (which contains the entire matrix),
  with its reference count increased by one. Hence changing values of `A_loc` changes `A`. If `MAT_REUSE_MATRIX` is requested on a single process,
  then `MatCopy`(Adiag,*`A_loc`,`SAME_NONZERO_PATTERN`) is called to fill `A_loc`. Thus one can preallocate the appropriate sequential matrix `A_loc`
  and then call this routine with `MAT_REUSE_MATRIX`. In this case, one can modify the values of `A_loc` without affecting the original matrix.
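
  Example Usage:
  A minimal sketch; the `MAT_REUSE_MATRIX` call assumes only the values (not the nonzero pattern) of `A` have changed
.vb
  Mat A_loc;

  PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &A_loc));
  /* ... change values of A ... */
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &A_loc));
  PetscCall(MatDestroy(&A_loc));
.ve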
5211 
5212 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMatCondensed()`, `MatMPIAIJGetLocalMatMerge()`
5213 @*/
5214 PetscErrorCode MatMPIAIJGetLocalMat(Mat A, MatReuse scall, Mat *A_loc)
5215 {
5216   Mat_MPIAIJ        *mpimat = (Mat_MPIAIJ *)A->data;
5217   Mat_SeqAIJ        *mat, *a, *b;
5218   PetscInt          *ai, *aj, *bi, *bj, *cmap = mpimat->garray;
5219   const PetscScalar *aa, *ba, *aav, *bav;
5220   PetscScalar       *ca, *cam;
5221   PetscMPIInt        size;
5222   PetscInt           am = A->rmap->n, i, j, k, cstart = A->cmap->rstart;
5223   PetscInt          *ci, *cj, col, ncols_d, ncols_o, jo;
5224   PetscBool          match;
5225 
5226   PetscFunctionBegin;
5227   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &match));
5228   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5229   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5230   if (size == 1) {
5231     if (scall == MAT_INITIAL_MATRIX) {
5232       PetscCall(PetscObjectReference((PetscObject)mpimat->A));
5233       *A_loc = mpimat->A;
5234     } else if (scall == MAT_REUSE_MATRIX) {
5235       PetscCall(MatCopy(mpimat->A, *A_loc, SAME_NONZERO_PATTERN));
5236     }
5237     PetscFunctionReturn(PETSC_SUCCESS);
5238   }
5239 
5240   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5241   a  = (Mat_SeqAIJ *)mpimat->A->data;
5242   b  = (Mat_SeqAIJ *)mpimat->B->data;
5243   ai = a->i;
5244   aj = a->j;
5245   bi = b->i;
5246   bj = b->j;
5247   PetscCall(MatSeqAIJGetArrayRead(mpimat->A, &aav));
5248   PetscCall(MatSeqAIJGetArrayRead(mpimat->B, &bav));
5249   aa = aav;
5250   ba = bav;
5251   if (scall == MAT_INITIAL_MATRIX) {
5252     PetscCall(PetscMalloc1(1 + am, &ci));
5253     ci[0] = 0;
5254     for (i = 0; i < am; i++) ci[i + 1] = ci[i] + (ai[i + 1] - ai[i]) + (bi[i + 1] - bi[i]);
5255     PetscCall(PetscMalloc1(1 + ci[am], &cj));
5256     PetscCall(PetscMalloc1(1 + ci[am], &ca));
5257     k = 0;
5258     for (i = 0; i < am; i++) {
5259       ncols_o = bi[i + 1] - bi[i];
5260       ncols_d = ai[i + 1] - ai[i];
      /* off-diagonal portion of A (columns to the left of the diagonal block) */
5262       for (jo = 0; jo < ncols_o; jo++) {
5263         col = cmap[*bj];
5264         if (col >= cstart) break;
5265         cj[k] = col;
5266         bj++;
5267         ca[k++] = *ba++;
5268       }
5269       /* diagonal portion of A */
5270       for (j = 0; j < ncols_d; j++) {
5271         cj[k]   = cstart + *aj++;
5272         ca[k++] = *aa++;
5273       }
      /* off-diagonal portion of A (columns to the right of the diagonal block) */
5275       for (j = jo; j < ncols_o; j++) {
5276         cj[k]   = cmap[*bj++];
5277         ca[k++] = *ba++;
5278       }
5279     }
5280     /* put together the new matrix */
5281     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, A->cmap->N, ci, cj, ca, A_loc));
5282     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5283     /* Since these are PETSc arrays, change flags to free them as necessary. */
5284     mat          = (Mat_SeqAIJ *)(*A_loc)->data;
5285     mat->free_a  = PETSC_TRUE;
5286     mat->free_ij = PETSC_TRUE;
5287     mat->nonew   = 0;
5288   } else if (scall == MAT_REUSE_MATRIX) {
5289     mat = (Mat_SeqAIJ *)(*A_loc)->data;
5290     ci  = mat->i;
5291     cj  = mat->j;
5292     PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &cam));
5293     for (i = 0; i < am; i++) {
      /* off-diagonal portion of A (columns to the left of the diagonal block) */
5295       ncols_o = bi[i + 1] - bi[i];
5296       for (jo = 0; jo < ncols_o; jo++) {
5297         col = cmap[*bj];
5298         if (col >= cstart) break;
5299         *cam++ = *ba++;
5300         bj++;
5301       }
5302       /* diagonal portion of A */
5303       ncols_d = ai[i + 1] - ai[i];
5304       for (j = 0; j < ncols_d; j++) *cam++ = *aa++;
      /* off-diagonal portion of A (columns to the right of the diagonal block) */
5306       for (j = jo; j < ncols_o; j++) {
5307         *cam++ = *ba++;
5308         bj++;
5309       }
5310     }
5311     PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &cam));
5312   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5313   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->A, &aav));
5314   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->B, &bav));
5315   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5316   PetscFunctionReturn(PETSC_SUCCESS);
5317 }
5318 
5319 /*@
  MatMPIAIJGetLocalMatMerge - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix by taking all its local rows and putting them into a sequential matrix with
  `mlocal` rows and `n` columns, where `n` is the sum of the number of columns of the diagonal and off-diagonal parts
5322 
5323   Not Collective
5324 
5325   Input Parameters:
5326 + A     - the matrix
5327 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5328 
5329   Output Parameters:
5330 + glob  - sequential `IS` with global indices associated with the columns of the local sequential matrix generated (can be `NULL`)
5331 - A_loc - the local sequential matrix generated
5332 
5333   Level: developer
5334 
  Note:
  This is different from `MatMPIAIJGetLocalMat()` since the first columns in the returned matrix are those associated with the diagonal
  part, followed by those associated with the off-diagonal part (in its local ordering)
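
  Example Usage:
  A minimal sketch that also retrieves the global column indices of the merged local matrix
.vb
  Mat A_loc;
  IS  glob;

  PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &A_loc));
  /* the k-th entry of glob is the global column of A corresponding to local column k of A_loc */
  PetscCall(ISDestroy(&glob));
  PetscCall(MatDestroy(&A_loc));
.ve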
5338 
5339 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`
5340 @*/
5341 PetscErrorCode MatMPIAIJGetLocalMatMerge(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
5342 {
5343   Mat             Ao, Ad;
5344   const PetscInt *cmap;
5345   PetscMPIInt     size;
5346   PetscErrorCode (*f)(Mat, MatReuse, IS *, Mat *);
5347 
5348   PetscFunctionBegin;
5349   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
5350   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5351   if (size == 1) {
5352     if (scall == MAT_INITIAL_MATRIX) {
5353       PetscCall(PetscObjectReference((PetscObject)Ad));
5354       *A_loc = Ad;
5355     } else if (scall == MAT_REUSE_MATRIX) {
5356       PetscCall(MatCopy(Ad, *A_loc, SAME_NONZERO_PATTERN));
5357     }
5358     if (glob) PetscCall(ISCreateStride(PetscObjectComm((PetscObject)Ad), Ad->cmap->n, Ad->cmap->rstart, 1, glob));
5359     PetscFunctionReturn(PETSC_SUCCESS);
5360   }
5361   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", &f));
5362   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5363   if (f) {
5364     PetscCall((*f)(A, scall, glob, A_loc));
5365   } else {
5366     Mat_SeqAIJ        *a = (Mat_SeqAIJ *)Ad->data;
5367     Mat_SeqAIJ        *b = (Mat_SeqAIJ *)Ao->data;
5368     Mat_SeqAIJ        *c;
5369     PetscInt          *ai = a->i, *aj = a->j;
5370     PetscInt          *bi = b->i, *bj = b->j;
5371     PetscInt          *ci, *cj;
5372     const PetscScalar *aa, *ba;
5373     PetscScalar       *ca;
5374     PetscInt           i, j, am, dn, on;
5375 
5376     PetscCall(MatGetLocalSize(Ad, &am, &dn));
5377     PetscCall(MatGetLocalSize(Ao, NULL, &on));
5378     PetscCall(MatSeqAIJGetArrayRead(Ad, &aa));
5379     PetscCall(MatSeqAIJGetArrayRead(Ao, &ba));
5380     if (scall == MAT_INITIAL_MATRIX) {
5381       PetscInt k;
5382       PetscCall(PetscMalloc1(1 + am, &ci));
5383       PetscCall(PetscMalloc1(ai[am] + bi[am], &cj));
5384       PetscCall(PetscMalloc1(ai[am] + bi[am], &ca));
5385       ci[0] = 0;
5386       for (i = 0, k = 0; i < am; i++) {
5387         const PetscInt ncols_o = bi[i + 1] - bi[i];
5388         const PetscInt ncols_d = ai[i + 1] - ai[i];
5389         ci[i + 1]              = ci[i] + ncols_o + ncols_d;
5390         /* diagonal portion of A */
5391         for (j = 0; j < ncols_d; j++, k++) {
5392           cj[k] = *aj++;
5393           ca[k] = *aa++;
5394         }
5395         /* off-diagonal portion of A */
5396         for (j = 0; j < ncols_o; j++, k++) {
5397           cj[k] = dn + *bj++;
5398           ca[k] = *ba++;
5399         }
5400       }
5401       /* put together the new matrix */
5402       PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, dn + on, ci, cj, ca, A_loc));
5403       /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5404       /* Since these are PETSc arrays, change flags to free them as necessary. */
5405       c          = (Mat_SeqAIJ *)(*A_loc)->data;
5406       c->free_a  = PETSC_TRUE;
5407       c->free_ij = PETSC_TRUE;
5408       c->nonew   = 0;
5409       PetscCall(MatSetType(*A_loc, ((PetscObject)Ad)->type_name));
5410     } else if (scall == MAT_REUSE_MATRIX) {
5411       PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &ca));
5412       for (i = 0; i < am; i++) {
5413         const PetscInt ncols_d = ai[i + 1] - ai[i];
5414         const PetscInt ncols_o = bi[i + 1] - bi[i];
5415         /* diagonal portion of A */
5416         for (j = 0; j < ncols_d; j++) *ca++ = *aa++;
5417         /* off-diagonal portion of A */
5418         for (j = 0; j < ncols_o; j++) *ca++ = *ba++;
5419       }
5420       PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &ca));
5421     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5422     PetscCall(MatSeqAIJRestoreArrayRead(Ad, &aa));
    PetscCall(MatSeqAIJRestoreArrayRead(Ao, &ba));
5424     if (glob) {
5425       PetscInt cst, *gidx;
5426 
5427       PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
5428       PetscCall(PetscMalloc1(dn + on, &gidx));
5429       for (i = 0; i < dn; i++) gidx[i] = cst + i;
5430       for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
5431       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
5432     }
5433   }
5434   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5435   PetscFunctionReturn(PETSC_SUCCESS);
5436 }
5437 
5438 /*@C
  MatMPIAIJGetLocalMatCondensed - Creates a `MATSEQAIJ` matrix from a `MATMPIAIJ` matrix by taking all its local rows and NON-ZERO columns
5440 
5441   Not Collective
5442 
5443   Input Parameters:
5444 + A     - the matrix
5445 . scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5446 . row   - index set of rows to extract (or `NULL`)
5447 - col   - index set of columns to extract (or `NULL`)
5448 
5449   Output Parameter:
5450 . A_loc - the local sequential matrix generated
5451 
5452   Level: developer
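
  Example Usage:
  A minimal sketch extracting all local rows and all nonzero columns (pass `NULL` for both index sets)
.vb
  Mat A_loc;

  PetscCall(MatMPIAIJGetLocalMatCondensed(A, MAT_INITIAL_MATRIX, NULL, NULL, &A_loc));
  /* ... use A_loc ... */
  PetscCall(MatDestroy(&A_loc));
.ve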
5453 
5454 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`
5455 @*/
5456 PetscErrorCode MatMPIAIJGetLocalMatCondensed(Mat A, MatReuse scall, IS *row, IS *col, Mat *A_loc)
5457 {
5458   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5459   PetscInt    i, start, end, ncols, nzA, nzB, *cmap, imark, *idx;
5460   IS          isrowa, iscola;
5461   Mat        *aloc;
5462   PetscBool   match;
5463 
5464   PetscFunctionBegin;
5465   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &match));
5466   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5467   PetscCall(PetscLogEventBegin(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5468   if (!row) {
5469     start = A->rmap->rstart;
5470     end   = A->rmap->rend;
5471     PetscCall(ISCreateStride(PETSC_COMM_SELF, end - start, start, 1, &isrowa));
5472   } else {
5473     isrowa = *row;
5474   }
5475   if (!col) {
5476     start = A->cmap->rstart;
5477     cmap  = a->garray;
5478     nzA   = a->A->cmap->n;
5479     nzB   = a->B->cmap->n;
5480     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5481     ncols = 0;
5482     for (i = 0; i < nzB; i++) {
5483       if (cmap[i] < start) idx[ncols++] = cmap[i];
5484       else break;
5485     }
5486     imark = i;
5487     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;
5488     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i];
5489     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &iscola));
5490   } else {
5491     iscola = *col;
5492   }
5493   if (scall != MAT_INITIAL_MATRIX) {
5494     PetscCall(PetscMalloc1(1, &aloc));
5495     aloc[0] = *A_loc;
5496   }
5497   PetscCall(MatCreateSubMatrices(A, 1, &isrowa, &iscola, scall, &aloc));
5498   if (!col) { /* attach global id of condensed columns */
5499     PetscCall(PetscObjectCompose((PetscObject)aloc[0], "_petsc_GetLocalMatCondensed_iscol", (PetscObject)iscola));
5500   }
5501   *A_loc = aloc[0];
5502   PetscCall(PetscFree(aloc));
5503   if (!row) PetscCall(ISDestroy(&isrowa));
5504   if (!col) PetscCall(ISDestroy(&iscola));
5505   PetscCall(PetscLogEventEnd(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5506   PetscFunctionReturn(PETSC_SUCCESS);
5507 }
5508 
/*
 * Create a sequential AIJ matrix based on row indices: an entire row is extracted whenever a row index is matched.
 * Rows can be local or remote. The routine is designed to be scalable in memory so that nothing is allocated based
 * on a global size.
 * */
5514 static PetscErrorCode MatCreateSeqSubMatrixWithRows_Private(Mat P, IS rows, Mat *P_oth)
5515 {
5516   Mat_MPIAIJ            *p  = (Mat_MPIAIJ *)P->data;
5517   Mat_SeqAIJ            *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data, *p_oth;
5518   PetscInt               plocalsize, nrows, *ilocal, *oilocal, i, lidx, *nrcols, *nlcols, ncol;
5519   PetscMPIInt            owner;
5520   PetscSFNode           *iremote, *oiremote;
5521   const PetscInt        *lrowindices;
5522   PetscSF                sf, osf;
5523   PetscInt               pcstart, *roffsets, *loffsets, *pnnz, j;
5524   PetscInt               ontotalcols, dntotalcols, ntotalcols, nout;
5525   MPI_Comm               comm;
5526   ISLocalToGlobalMapping mapping;
5527   const PetscScalar     *pd_a, *po_a;
5528 
5529   PetscFunctionBegin;
5530   PetscCall(PetscObjectGetComm((PetscObject)P, &comm));
5531   /* plocalsize is the number of roots
5532    * nrows is the number of leaves
5533    * */
5534   PetscCall(MatGetLocalSize(P, &plocalsize, NULL));
5535   PetscCall(ISGetLocalSize(rows, &nrows));
5536   PetscCall(PetscCalloc1(nrows, &iremote));
5537   PetscCall(ISGetIndices(rows, &lrowindices));
5538   for (i = 0; i < nrows; i++) {
5539     /* Find a remote index and an owner for a row
5540      * The row could be local or remote
5541      * */
5542     owner = 0;
5543     lidx  = 0;
5544     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, &lidx));
5545     iremote[i].index = lidx;
5546     iremote[i].rank  = owner;
5547   }
5548   /* Create SF to communicate how many nonzero columns for each row */
5549   PetscCall(PetscSFCreate(comm, &sf));
5550   /* SF will figure out the number of nonzero columns for each row, and their
5551    * offsets
5552    * */
5553   PetscCall(PetscSFSetGraph(sf, plocalsize, nrows, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5554   PetscCall(PetscSFSetFromOptions(sf));
5555   PetscCall(PetscSFSetUp(sf));
5556 
5557   PetscCall(PetscCalloc1(2 * (plocalsize + 1), &roffsets));
5558   PetscCall(PetscCalloc1(2 * plocalsize, &nrcols));
5559   PetscCall(PetscCalloc1(nrows, &pnnz));
5560   roffsets[0] = 0;
5561   roffsets[1] = 0;
5562   for (i = 0; i < plocalsize; i++) {
5563     /* diagonal */
5564     nrcols[i * 2 + 0] = pd->i[i + 1] - pd->i[i];
5565     /* off-diagonal */
5566     nrcols[i * 2 + 1] = po->i[i + 1] - po->i[i];
    /* compute offsets so that we know the relative location of each row */
5568     roffsets[(i + 1) * 2 + 0] = roffsets[i * 2 + 0] + nrcols[i * 2 + 0];
5569     roffsets[(i + 1) * 2 + 1] = roffsets[i * 2 + 1] + nrcols[i * 2 + 1];
5570   }
5571   PetscCall(PetscCalloc1(2 * nrows, &nlcols));
5572   PetscCall(PetscCalloc1(2 * nrows, &loffsets));
5573   /* 'r' means root, and 'l' means leaf */
5574   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5575   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5576   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5577   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5578   PetscCall(PetscSFDestroy(&sf));
5579   PetscCall(PetscFree(roffsets));
5580   PetscCall(PetscFree(nrcols));
5581   dntotalcols = 0;
5582   ontotalcols = 0;
5583   ncol        = 0;
5584   for (i = 0; i < nrows; i++) {
5585     pnnz[i] = nlcols[i * 2 + 0] + nlcols[i * 2 + 1];
5586     ncol    = PetscMax(pnnz[i], ncol);
5587     /* diagonal */
5588     dntotalcols += nlcols[i * 2 + 0];
5589     /* off-diagonal */
5590     ontotalcols += nlcols[i * 2 + 1];
5591   }
  /* We do not need to figure out the exact number of columns
   * since all the calculations are done by going through the raw data
   * */
5595   PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, nrows, ncol, 0, pnnz, P_oth));
5596   PetscCall(MatSetUp(*P_oth));
5597   PetscCall(PetscFree(pnnz));
5598   p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5599   /* diagonal */
5600   PetscCall(PetscCalloc1(dntotalcols, &iremote));
5601   /* off-diagonal */
5602   PetscCall(PetscCalloc1(ontotalcols, &oiremote));
5603   /* diagonal */
5604   PetscCall(PetscCalloc1(dntotalcols, &ilocal));
5605   /* off-diagonal */
5606   PetscCall(PetscCalloc1(ontotalcols, &oilocal));
5607   dntotalcols = 0;
5608   ontotalcols = 0;
5609   ntotalcols  = 0;
5610   for (i = 0; i < nrows; i++) {
5611     owner = 0;
5612     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, NULL));
5613     /* Set iremote for diag matrix */
5614     for (j = 0; j < nlcols[i * 2 + 0]; j++) {
5615       iremote[dntotalcols].index = loffsets[i * 2 + 0] + j;
5616       iremote[dntotalcols].rank  = owner;
      /* P_oth is SeqAIJ, so ilocal needs to point into the first part of the memory */
5618       ilocal[dntotalcols++] = ntotalcols++;
5619     }
5620     /* off-diagonal */
5621     for (j = 0; j < nlcols[i * 2 + 1]; j++) {
5622       oiremote[ontotalcols].index = loffsets[i * 2 + 1] + j;
5623       oiremote[ontotalcols].rank  = owner;
5624       oilocal[ontotalcols++]      = ntotalcols++;
5625     }
5626   }
5627   PetscCall(ISRestoreIndices(rows, &lrowindices));
5628   PetscCall(PetscFree(loffsets));
5629   PetscCall(PetscFree(nlcols));
5630   PetscCall(PetscSFCreate(comm, &sf));
  /* P serves as the roots and P_oth as the leaves
   * This SF handles the diagonal block
   * */
5634   PetscCall(PetscSFSetGraph(sf, pd->i[plocalsize], dntotalcols, ilocal, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5635   PetscCall(PetscSFSetFromOptions(sf));
5636   PetscCall(PetscSFSetUp(sf));
5637 
5638   PetscCall(PetscSFCreate(comm, &osf));
5639   /* off-diagonal */
5640   PetscCall(PetscSFSetGraph(osf, po->i[plocalsize], ontotalcols, oilocal, PETSC_OWN_POINTER, oiremote, PETSC_OWN_POINTER));
5641   PetscCall(PetscSFSetFromOptions(osf));
5642   PetscCall(PetscSFSetUp(osf));
5643   PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5644   PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5645   /* operate on the matrix internal data to save memory */
5646   PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5647   PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5648   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, NULL));
5649   /* Convert to global indices for diag matrix */
5650   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] += pcstart;
5651   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
  /* We want P_oth to store global indices */
5653   PetscCall(ISLocalToGlobalMappingCreate(comm, 1, p->B->cmap->n, p->garray, PETSC_COPY_VALUES, &mapping));
5654   /* Use memory scalable approach */
5655   PetscCall(ISLocalToGlobalMappingSetType(mapping, ISLOCALTOGLOBALMAPPINGHASH));
5656   PetscCall(ISLocalToGlobalMappingApply(mapping, po->i[plocalsize], po->j, po->j));
5657   PetscCall(PetscSFBcastBegin(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5658   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
5659   /* Convert back to local indices */
5660   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] -= pcstart;
5661   PetscCall(PetscSFBcastEnd(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5662   nout = 0;
5663   PetscCall(ISGlobalToLocalMappingApply(mapping, IS_GTOLM_DROP, po->i[plocalsize], po->j, &nout, po->j));
5664   PetscCheck(nout == po->i[plocalsize], comm, PETSC_ERR_ARG_INCOMP, "n %" PetscInt_FMT " does not equal to nout %" PetscInt_FMT " ", po->i[plocalsize], nout);
5665   PetscCall(ISLocalToGlobalMappingDestroy(&mapping));
5666   /* Exchange values */
5667   PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5668   PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5669   PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5670   PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5671   /* Stop PETSc from shrinking memory */
5672   for (i = 0; i < nrows; i++) p_oth->ilen[i] = p_oth->imax[i];
5673   PetscCall(MatAssemblyBegin(*P_oth, MAT_FINAL_ASSEMBLY));
5674   PetscCall(MatAssemblyEnd(*P_oth, MAT_FINAL_ASSEMBLY));
5675   /* Attach PetscSF objects to P_oth so that we can reuse it later */
5676   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "diagsf", (PetscObject)sf));
5677   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "offdiagsf", (PetscObject)osf));
5678   PetscCall(PetscSFDestroy(&sf));
5679   PetscCall(PetscSFDestroy(&osf));
5680   PetscFunctionReturn(PETSC_SUCCESS);
5681 }
5682 
/*
 * Creates a SeqAIJ matrix by taking the rows of P that correspond to the nonzero columns of local A.
 * This supports MPIAIJ and MAIJ.
 * */
5687 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat A, Mat P, PetscInt dof, MatReuse reuse, Mat *P_oth)
5688 {
5689   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
5690   Mat_SeqAIJ *p_oth;
5691   IS          rows, map;
5692   PetscHMapI  hamp;
5693   PetscInt    i, htsize, *rowindices, off, *mapping, key, count;
5694   MPI_Comm    comm;
5695   PetscSF     sf, osf;
5696   PetscBool   has;
5697 
5698   PetscFunctionBegin;
5699   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5700   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, P, 0, 0));
5701   /* If it is the first time, create an index set of off-diag nonzero columns of A,
5702    *  and then create a submatrix (that often is an overlapping matrix)
5703    * */
5704   if (reuse == MAT_INITIAL_MATRIX) {
5705     /* Use a hash table to figure out unique keys */
5706     PetscCall(PetscHMapICreateWithSize(a->B->cmap->n, &hamp));
5707     PetscCall(PetscCalloc1(a->B->cmap->n, &mapping));
5708     count = 0;
    /* Assume that a->garray is sorted, otherwise the following does not make sense */
5710     for (i = 0; i < a->B->cmap->n; i++) {
5711       key = a->garray[i] / dof;
5712       PetscCall(PetscHMapIHas(hamp, key, &has));
5713       if (!has) {
5714         mapping[i] = count;
5715         PetscCall(PetscHMapISet(hamp, key, count++));
5716       } else {
        /* Current 'i' has the same key as the previous entry */
5718         mapping[i] = count - 1;
5719       }
5720     }
5721     PetscCall(ISCreateGeneral(comm, a->B->cmap->n, mapping, PETSC_OWN_POINTER, &map));
5722     PetscCall(PetscHMapIGetSize(hamp, &htsize));
5723     PetscCheck(htsize == count, comm, PETSC_ERR_ARG_INCOMP, " Size of hash map %" PetscInt_FMT " is inconsistent with count %" PetscInt_FMT, htsize, count);
5724     PetscCall(PetscCalloc1(htsize, &rowindices));
5725     off = 0;
5726     PetscCall(PetscHMapIGetKeys(hamp, &off, rowindices));
5727     PetscCall(PetscHMapIDestroy(&hamp));
5728     PetscCall(PetscSortInt(htsize, rowindices));
5729     PetscCall(ISCreateGeneral(comm, htsize, rowindices, PETSC_OWN_POINTER, &rows));
    /* In case the matrix was already created but the user wants to recreate it */
5731     PetscCall(MatDestroy(P_oth));
5732     PetscCall(MatCreateSeqSubMatrixWithRows_Private(P, rows, P_oth));
5733     PetscCall(PetscObjectCompose((PetscObject)*P_oth, "aoffdiagtopothmapping", (PetscObject)map));
5734     PetscCall(ISDestroy(&map));
5735     PetscCall(ISDestroy(&rows));
5736   } else if (reuse == MAT_REUSE_MATRIX) {
    /* If the matrix was already created, we simply update its values using the SF objects
     * that were attached to the matrix earlier.
     */
5740     const PetscScalar *pd_a, *po_a;
5741 
5742     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "diagsf", (PetscObject *)&sf));
5743     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "offdiagsf", (PetscObject *)&osf));
5744     PetscCheck(sf && osf, comm, PETSC_ERR_ARG_NULL, "Matrix is not initialized yet");
5745     p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5746     /* Update values in place */
5747     PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5748     PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5749     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5750     PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5751     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5752     PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5753     PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5754     PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5755   } else SETERRQ(comm, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unknown reuse type");
5756   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, P, 0, 0));
5757   PetscFunctionReturn(PETSC_SUCCESS);
5758 }
5759 
5760 /*@C
  MatGetBrowsOfAcols - Returns index sets and a sequential matrix containing the rows of `B` that correspond to the nonzero columns of local `A`
5762 
5763   Collective
5764 
5765   Input Parameters:
5766 + A     - the first matrix in `MATMPIAIJ` format
5767 . B     - the second matrix in `MATMPIAIJ` format
5768 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5769 
5770   Output Parameters:
+ rowb  - on input, the index set of rows of `B` to extract (or `NULL`); modified on output
. colb  - on input, the index set of columns of `B` to extract (or `NULL`); modified on output
5773 - B_seq - the sequential matrix generated
5774 
5775   Level: developer
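
  Example Usage:
  A minimal sketch; the index sets returned by the first call must be passed back unchanged for `MAT_REUSE_MATRIX`
.vb
  IS  rowb = NULL, colb = NULL;
  Mat B_seq;

  PetscCall(MatGetBrowsOfAcols(A, B, MAT_INITIAL_MATRIX, &rowb, &colb, &B_seq));
  /* ... change values of B ... */
  PetscCall(MatGetBrowsOfAcols(A, B, MAT_REUSE_MATRIX, &rowb, &colb, &B_seq));
  PetscCall(ISDestroy(&rowb));
  PetscCall(ISDestroy(&colb));
  PetscCall(MatDestroy(&B_seq));
.ve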
5776 
5777 .seealso: `Mat`, `MATMPIAIJ`, `IS`, `MatReuse`
5778 @*/
5779 PetscErrorCode MatGetBrowsOfAcols(Mat A, Mat B, MatReuse scall, IS *rowb, IS *colb, Mat *B_seq)
5780 {
5781   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5782   PetscInt   *idx, i, start, ncols, nzA, nzB, *cmap, imark;
5783   IS          isrowb, iscolb;
5784   Mat        *bseq = NULL;
5785 
5786   PetscFunctionBegin;
5787   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5788              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5789   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAcols, A, B, 0, 0));
5790 
5791   if (scall == MAT_INITIAL_MATRIX) {
5792     start = A->cmap->rstart;
5793     cmap  = a->garray;
5794     nzA   = a->A->cmap->n;
5795     nzB   = a->B->cmap->n;
5796     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5797     ncols = 0;
5798     for (i = 0; i < nzB; i++) { /* row < local row index */
5799       if (cmap[i] < start) idx[ncols++] = cmap[i];
5800       else break;
5801     }
5802     imark = i;
5803     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;   /* local rows */
5804     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i]; /* row > local row index */
5805     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &isrowb));
5806     PetscCall(ISCreateStride(PETSC_COMM_SELF, B->cmap->N, 0, 1, &iscolb));
5807   } else {
5808     PetscCheck(rowb && colb, PETSC_COMM_SELF, PETSC_ERR_SUP, "IS rowb and colb must be provided for MAT_REUSE_MATRIX");
5809     isrowb = *rowb;
5810     iscolb = *colb;
5811     PetscCall(PetscMalloc1(1, &bseq));
5812     bseq[0] = *B_seq;
5813   }
5814   PetscCall(MatCreateSubMatrices(B, 1, &isrowb, &iscolb, scall, &bseq));
5815   *B_seq = bseq[0];
5816   PetscCall(PetscFree(bseq));
5817   if (!rowb) {
5818     PetscCall(ISDestroy(&isrowb));
5819   } else {
5820     *rowb = isrowb;
5821   }
5822   if (!colb) {
5823     PetscCall(ISDestroy(&iscolb));
5824   } else {
5825     *colb = iscolb;
5826   }
5827   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAcols, A, B, 0, 0));
5828   PetscFunctionReturn(PETSC_SUCCESS);
5829 }
5830 
5831 /*
    MatGetBrowsOfAoCols_MPIAIJ - Creates a `MATSEQAIJ` matrix by taking the rows of B that correspond to the nonzero columns
    of the OFF-DIAGONAL portion of local A
5834 
5835     Collective
5836 
5837    Input Parameters:
5838 +    A,B - the matrices in `MATMPIAIJ` format
5839 -    scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5840 
   Output Parameters:
5842 +    startsj_s - starting point in B's sending j-arrays, saved for MAT_REUSE (or NULL)
5843 .    startsj_r - starting point in B's receiving j-arrays, saved for MAT_REUSE (or NULL)
5844 .    bufa_ptr - array for sending matrix values, saved for MAT_REUSE (or NULL)
5845 -    B_oth - the sequential matrix generated with size aBn=a->B->cmap->n by B->cmap->N
5846 
5847     Developer Note:
5848     This directly accesses information inside the VecScatter associated with the matrix-vector product
     for this matrix. This is not desirable.
5850 
5851     Level: developer
5852 
5853 */
5854 
5855 PetscErrorCode MatGetBrowsOfAoCols_MPIAIJ(Mat A, Mat B, MatReuse scall, PetscInt **startsj_s, PetscInt **startsj_r, MatScalar **bufa_ptr, Mat *B_oth)
5856 {
5857   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
5858   VecScatter         ctx;
5859   MPI_Comm           comm;
5860   const PetscMPIInt *rprocs, *sprocs;
5861   PetscMPIInt        nrecvs, nsends;
5862   const PetscInt    *srow, *rstarts, *sstarts;
5863   PetscInt          *rowlen, *bufj, *bufJ, ncols = 0, aBn = a->B->cmap->n, row, *b_othi, *b_othj, *rvalues = NULL, *svalues = NULL, *cols, sbs, rbs;
5864   PetscInt           i, j, k = 0, l, ll, nrows, *rstartsj = NULL, *sstartsj, len;
5865   PetscScalar       *b_otha, *bufa, *bufA, *vals = NULL;
5866   MPI_Request       *reqs = NULL, *rwaits = NULL, *swaits = NULL;
5867   PetscMPIInt        size, tag, rank, nreqs;
5868 
5869   PetscFunctionBegin;
5870   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5871   PetscCallMPI(MPI_Comm_size(comm, &size));
5872 
5873   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5874              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5875   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, B, 0, 0));
5876   PetscCallMPI(MPI_Comm_rank(comm, &rank));
5877 
5878   if (size == 1) {
5879     startsj_s = NULL;
5880     bufa_ptr  = NULL;
5881     *B_oth    = NULL;
5882     PetscFunctionReturn(PETSC_SUCCESS);
5883   }
5884 
5885   ctx = a->Mvctx;
5886   tag = ((PetscObject)ctx)->tag;
5887 
5888   PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &srow, &sprocs, &sbs));
5889   /* rprocs[] must be ordered so that indices received from them are ordered in rvalues[], which is key to algorithms used in this subroutine */
5890   PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL /*indices not needed*/, &rprocs, &rbs));
5891   PetscCall(PetscMPIIntCast(nsends + nrecvs, &nreqs));
5892   PetscCall(PetscMalloc1(nreqs, &reqs));
5893   rwaits = reqs;
5894   swaits = PetscSafePointerPlusOffset(reqs, nrecvs);
5895 
5896   if (!startsj_s || !bufa_ptr) scall = MAT_INITIAL_MATRIX;
5897   if (scall == MAT_INITIAL_MATRIX) {
5898     /* i-array */
5899     /*  post receives */
5900     if (nrecvs) PetscCall(PetscMalloc1(rbs * (rstarts[nrecvs] - rstarts[0]), &rvalues)); /* rstarts can be NULL when nrecvs=0 */
5901     for (i = 0; i < nrecvs; i++) {
5902       rowlen = rvalues + rstarts[i] * rbs;
5903       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of indices to be received */
5904       PetscCallMPI(MPIU_Irecv(rowlen, nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5905     }
5906 
5907     /* pack the outgoing message */
5908     PetscCall(PetscMalloc2(nsends + 1, &sstartsj, nrecvs + 1, &rstartsj));
5909 
5910     sstartsj[0] = 0;
5911     rstartsj[0] = 0;
5912     len         = 0; /* total length of j or a array to be sent */
5913     if (nsends) {
5914       k = sstarts[0]; /* ATTENTION: sstarts[0] and rstarts[0] are not necessarily zero */
5915       PetscCall(PetscMalloc1(sbs * (sstarts[nsends] - sstarts[0]), &svalues));
5916     }
5917     for (i = 0; i < nsends; i++) {
5918       rowlen = svalues + (sstarts[i] - sstarts[0]) * sbs;
5919       nrows  = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5920       for (j = 0; j < nrows; j++) {
5921         row = srow[k] + B->rmap->range[rank]; /* global row idx */
5922         for (l = 0; l < sbs; l++) {
5923           PetscCall(MatGetRow_MPIAIJ(B, row + l, &ncols, NULL, NULL)); /* rowlength */
5924 
5925           rowlen[j * sbs + l] = ncols;
5926 
5927           len += ncols;
5928           PetscCall(MatRestoreRow_MPIAIJ(B, row + l, &ncols, NULL, NULL));
5929         }
5930         k++;
5931       }
5932       PetscCallMPI(MPIU_Isend(rowlen, nrows * sbs, MPIU_INT, sprocs[i], tag, comm, swaits + i));
5933 
5934       sstartsj[i + 1] = len; /* starting point of (i+1)-th outgoing msg in bufj and bufa */
5935     }
5936     /* recvs and sends of i-array are completed */
5937     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5938     PetscCall(PetscFree(svalues));
5939 
5940     /* allocate buffers for sending j and a arrays */
5941     PetscCall(PetscMalloc1(len + 1, &bufj));
5942     PetscCall(PetscMalloc1(len + 1, &bufa));
5943 
5944     /* create i-array of B_oth */
5945     PetscCall(PetscMalloc1(aBn + 2, &b_othi));
5946 
5947     b_othi[0] = 0;
5948     len       = 0; /* total length of j or a array to be received */
5949     k         = 0;
5950     for (i = 0; i < nrecvs; i++) {
5951       rowlen = rvalues + (rstarts[i] - rstarts[0]) * rbs;
5952       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of rows to be received */
5953       for (j = 0; j < nrows; j++) {
5954         b_othi[k + 1] = b_othi[k] + rowlen[j];
5955         PetscCall(PetscIntSumError(rowlen[j], len, &len));
5956         k++;
5957       }
5958       rstartsj[i + 1] = len; /* starting point of (i+1)-th incoming msg in bufj and bufa */
5959     }
5960     PetscCall(PetscFree(rvalues));
5961 
5962     /* allocate space for j and a arrays of B_oth */
5963     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_othj));
5964     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_otha));
5965 
5966     /* j-array */
5967     /*  post receives of j-array */
5968     for (i = 0; i < nrecvs; i++) {
5969       nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5970       PetscCallMPI(MPIU_Irecv(b_othj + rstartsj[i], nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5971     }
5972 
5973     /* pack the outgoing message j-array */
5974     if (nsends) k = sstarts[0];
5975     for (i = 0; i < nsends; i++) {
5976       nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5977       bufJ  = bufj + sstartsj[i];
5978       for (j = 0; j < nrows; j++) {
5979         row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5980         for (ll = 0; ll < sbs; ll++) {
5981           PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5982           for (l = 0; l < ncols; l++) *bufJ++ = cols[l];
5983           PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5984         }
5985       }
5986       PetscCallMPI(MPIU_Isend(bufj + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_INT, sprocs[i], tag, comm, swaits + i));
5987     }
5988 
5989     /* recvs and sends of j-array are completed */
5990     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5991   } else if (scall == MAT_REUSE_MATRIX) {
5992     sstartsj = *startsj_s;
5993     rstartsj = *startsj_r;
5994     bufa     = *bufa_ptr;
5995     PetscCall(MatSeqAIJGetArrayWrite(*B_oth, &b_otha));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5997 
5998   /* a-array */
5999   /*  post receives of a-array */
6000   for (i = 0; i < nrecvs; i++) {
6001     nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
6002     PetscCallMPI(MPIU_Irecv(b_otha + rstartsj[i], nrows, MPIU_SCALAR, rprocs[i], tag, comm, rwaits + i));
6003   }
6004 
6005   /* pack the outgoing message a-array */
6006   if (nsends) k = sstarts[0];
6007   for (i = 0; i < nsends; i++) {
6008     nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
6009     bufA  = bufa + sstartsj[i];
6010     for (j = 0; j < nrows; j++) {
6011       row = srow[k++] + B->rmap->range[rank]; /* global row idx */
6012       for (ll = 0; ll < sbs; ll++) {
6013         PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
6014         for (l = 0; l < ncols; l++) *bufA++ = vals[l];
6015         PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
6016       }
6017     }
6018     PetscCallMPI(MPIU_Isend(bufa + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_SCALAR, sprocs[i], tag, comm, swaits + i));
6019   }
6020   /* recvs and sends of a-array are completed */
6021   if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
6022   PetscCall(PetscFree(reqs));
6023 
6024   if (scall == MAT_INITIAL_MATRIX) {
6025     Mat_SeqAIJ *b_oth;
6026 
6027     /* put together the new matrix */
6028     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, aBn, B->cmap->N, b_othi, b_othj, b_otha, B_oth));
6029 
6030     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
6031     /* Since these are PETSc arrays, change flags to free them as necessary. */
6032     b_oth          = (Mat_SeqAIJ *)(*B_oth)->data;
6033     b_oth->free_a  = PETSC_TRUE;
6034     b_oth->free_ij = PETSC_TRUE;
6035     b_oth->nonew   = 0;
6036 
6037     PetscCall(PetscFree(bufj));
6038     if (!startsj_s || !bufa_ptr) {
6039       PetscCall(PetscFree2(sstartsj, rstartsj));
      PetscCall(PetscFree(bufa));
6041     } else {
6042       *startsj_s = sstartsj;
6043       *startsj_r = rstartsj;
6044       *bufa_ptr  = bufa;
6045     }
6046   } else if (scall == MAT_REUSE_MATRIX) {
6047     PetscCall(MatSeqAIJRestoreArrayWrite(*B_oth, &b_otha));
6048   }
6049 
6050   PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE, &nsends, &sstarts, &srow, &sprocs, &sbs));
6051   PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE, &nrecvs, &rstarts, NULL, &rprocs, &rbs));
6052   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, B, 0, 0));
6053   PetscFunctionReturn(PETSC_SUCCESS);
6054 }
6055 
6056 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCRL(Mat, MatType, MatReuse, Mat *);
6057 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJPERM(Mat, MatType, MatReuse, Mat *);
6058 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJSELL(Mat, MatType, MatReuse, Mat *);
6059 #if defined(PETSC_HAVE_MKL_SPARSE)
6060 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJMKL(Mat, MatType, MatReuse, Mat *);
6061 #endif
6062 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIBAIJ(Mat, MatType, MatReuse, Mat *);
6063 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISBAIJ(Mat, MatType, MatReuse, Mat *);
6064 #if defined(PETSC_HAVE_ELEMENTAL)
6065 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_Elemental(Mat, MatType, MatReuse, Mat *);
6066 #endif
6067 #if defined(PETSC_HAVE_SCALAPACK)
6068 PETSC_INTERN PetscErrorCode MatConvert_AIJ_ScaLAPACK(Mat, MatType, MatReuse, Mat *);
6069 #endif
6070 #if defined(PETSC_HAVE_HYPRE)
6071 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
6072 #endif
6073 #if defined(PETSC_HAVE_CUDA)
6074 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
6075 #endif
6076 #if defined(PETSC_HAVE_HIP)
6077 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
6078 #endif
6079 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6080 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat, MatType, MatReuse, Mat *);
6081 #endif
6082 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISELL(Mat, MatType, MatReuse, Mat *);
6083 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
6084 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_IS_XAIJ(Mat);
6085 
6086 /*
    Computes (B'*A')' since computing A*B directly is untenable
6088 
6089                n                       p                          p
6090         [             ]       [             ]         [                 ]
6091       m [      A      ]  *  n [       B     ]   =   m [         C       ]
6092         [             ]       [             ]         [                 ]
6093 
6094 */
6095 static PetscErrorCode MatMatMultNumeric_MPIDense_MPIAIJ(Mat A, Mat B, Mat C)
6096 {
6097   Mat At, Bt, Ct;
6098 
6099   PetscFunctionBegin;
6100   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
6101   PetscCall(MatTranspose(B, MAT_INITIAL_MATRIX, &Bt));
6102   PetscCall(MatMatMult(Bt, At, MAT_INITIAL_MATRIX, PETSC_CURRENT, &Ct));
6103   PetscCall(MatDestroy(&At));
6104   PetscCall(MatDestroy(&Bt));
6105   PetscCall(MatTransposeSetPrecursor(Ct, C));
6106   PetscCall(MatTranspose(Ct, MAT_REUSE_MATRIX, &C));
6107   PetscCall(MatDestroy(&Ct));
6108   PetscFunctionReturn(PETSC_SUCCESS);
6109 }
6110 
6111 static PetscErrorCode MatMatMultSymbolic_MPIDense_MPIAIJ(Mat A, Mat B, PetscReal fill, Mat C)
6112 {
6113   PetscBool cisdense;
6114 
6115   PetscFunctionBegin;
6116   PetscCheck(A->cmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "A->cmap->n %" PetscInt_FMT " != B->rmap->n %" PetscInt_FMT, A->cmap->n, B->rmap->n);
6117   PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
6118   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6119   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSECUDA, MATMPIDENSEHIP, ""));
6120   if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
6121   PetscCall(MatSetUp(C));
6122 
6123   C->ops->matmultnumeric = MatMatMultNumeric_MPIDense_MPIAIJ;
6124   PetscFunctionReturn(PETSC_SUCCESS);
6125 }
6126 
6127 static PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ_AB(Mat C)
6128 {
6129   Mat_Product *product = C->product;
6130   Mat          A = product->A, B = product->B;
6131 
6132   PetscFunctionBegin;
6133   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
6134              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
6135   C->ops->matmultsymbolic = MatMatMultSymbolic_MPIDense_MPIAIJ;
6136   C->ops->productsymbolic = MatProductSymbolic_AB;
6137   PetscFunctionReturn(PETSC_SUCCESS);
6138 }
6139 
6140 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ(Mat C)
6141 {
6142   Mat_Product *product = C->product;
6143 
6144   PetscFunctionBegin;
6145   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_MPIDense_MPIAIJ_AB(C));
6146   PetscFunctionReturn(PETSC_SUCCESS);
6147 }
6148 
6149 /*
6150    Merge two sets of sorted nonzeros and return a CSR for the merged (sequential) matrix
6151 
6152   Input Parameters:
6153 
6154     j1,rowBegin1,rowEnd1,jmap1: describe the first set of nonzeros (Set1)
6155     j2,rowBegin2,rowEnd2,jmap2: describe the second set of nonzeros (Set2)
6156 
6157     mat: both sets' nonzeros are on m rows, where m is the number of local rows of the matrix mat
6158 
6159     For Set1, j1[] contains column indices of the nonzeros.
6160     For the k-th row (0<=k<m), [rowBegin1[k],rowEnd1[k]) index into j1[] and point to the begin/end nonzero in row k
    respectively (note rowEnd1[k] is not necessarily equal to rowBegin1[k+1]). Indices in this range of j1[] are sorted,
6162     but might have repeats. jmap1[t+1] - jmap1[t] is the number of repeats for the t-th unique nonzero in Set1.
6163 
6164     Similar for Set2.
6165 
6166     This routine merges the two sets of nonzeros row by row and removes repeats.
6167 
6168   Output Parameters: (memory is allocated by the caller)
6169 
6170     i[],j[]: the CSR of the merged matrix, which has m rows.
    imap1[]: the k-th unique nonzero in Set1 (k=0,1,...) corresponds to the imap1[k]-th unique nonzero in the merged matrix.
6172     imap2[]: similar to imap1[], but for Set2.
6173     Note we order nonzeros row-by-row and from left to right.
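
    A tiny worked example (hypothetical data, one row):
      Set1: j1 = {1,1,4} with jmap1 = {0,2,3}, i.e., unique columns {1,4}, column 1 appearing twice
      Set2: j2 = {2,4}   with jmap2 = {0,1,2}, i.e., unique columns {2,4}
    The merged row is j = {1,2,4}, with imap1 = {0,2} and imap2 = {1,2}.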
6174 */
6175 static PetscErrorCode MatMergeEntries_Internal(Mat mat, const PetscInt j1[], const PetscInt j2[], const PetscCount rowBegin1[], const PetscCount rowEnd1[], const PetscCount rowBegin2[], const PetscCount rowEnd2[], const PetscCount jmap1[], const PetscCount jmap2[], PetscCount imap1[], PetscCount imap2[], PetscInt i[], PetscInt j[])
6176 {
6177   PetscInt   r, m; /* Row index of mat */
6178   PetscCount t, t1, t2, b1, e1, b2, e2;
6179 
6180   PetscFunctionBegin;
6181   PetscCall(MatGetLocalSize(mat, &m, NULL));
  t1 = t2 = t = 0; /* Count unique nonzeros in Set1, Set2 and the merged set, respectively */
6183   i[0]        = 0;
6184   for (r = 0; r < m; r++) { /* Do row by row merging */
6185     b1 = rowBegin1[r];
6186     e1 = rowEnd1[r];
6187     b2 = rowBegin2[r];
6188     e2 = rowEnd2[r];
6189     while (b1 < e1 && b2 < e2) {
6190       if (j1[b1] == j2[b2]) { /* Same column index and hence same nonzero */
6191         j[t]      = j1[b1];
6192         imap1[t1] = t;
6193         imap2[t2] = t;
        b1 += jmap1[t1 + 1] - jmap1[t1]; /* Jump to the next unique nonzero of Set1 */
        b2 += jmap2[t2 + 1] - jmap2[t2]; /* Jump to the next unique nonzero of Set2 */
6196         t1++;
6197         t2++;
6198         t++;
6199       } else if (j1[b1] < j2[b2]) {
6200         j[t]      = j1[b1];
6201         imap1[t1] = t;
6202         b1 += jmap1[t1 + 1] - jmap1[t1];
6203         t1++;
6204         t++;
6205       } else {
6206         j[t]      = j2[b2];
6207         imap2[t2] = t;
6208         b2 += jmap2[t2 + 1] - jmap2[t2];
6209         t2++;
6210         t++;
6211       }
6212     }
6213     /* Merge the remaining in either j1[] or j2[] */
6214     while (b1 < e1) {
6215       j[t]      = j1[b1];
6216       imap1[t1] = t;
6217       b1 += jmap1[t1 + 1] - jmap1[t1];
6218       t1++;
6219       t++;
6220     }
6221     while (b2 < e2) {
6222       j[t]      = j2[b2];
6223       imap2[t2] = t;
6224       b2 += jmap2[t2 + 1] - jmap2[t2];
6225       t2++;
6226       t++;
6227     }
6228     PetscCall(PetscIntCast(t, i + r + 1));
6229   }
6230   PetscFunctionReturn(PETSC_SUCCESS);
6231 }
6232 
6233 /*
6234   Split nonzeros in a block of local rows into two subsets: those in the diagonal block and those in the off-diagonal block
6235 
6236   Input Parameters:
6237     mat: an MPI matrix that provides row and column layout information for splitting. Let's say its number of local rows is m.
6238     n,i[],j[],perm[]: there are n input entries, belonging to m rows. Row/col indices of the entries are stored in i[] and j[]
6239       respectively, along with a permutation array perm[]. Length of the i[],j[],perm[] arrays is n.
6240 
6241       i[] is already sorted, but within a row, j[] is not sorted and might have repeats.
6242       i[] might contain negative indices at the beginning, which means the corresponding entries should be ignored in the splitting.
6243 
6244   Output Parameters:
6245     j[],perm[]: the routine needs to sort j[] within each row along with perm[].
6246     rowBegin[],rowMid[],rowEnd[]: of length m, and the memory is preallocated and zeroed by the caller.
6247       They contain indices pointing to j[]. For 0<=r<m, [rowBegin[r],rowMid[r]) point to begin/end entries of row r of the diagonal block,
6248       and [rowMid[r],rowEnd[r]) point to begin/end entries of row r of the off-diagonal block.
6249 
6250     Aperm[],Ajmap[],Atot,Annz: Arrays are allocated by this routine.
6251       Atot: number of entries belonging to the diagonal block.
6252       Annz: number of unique nonzeros belonging to the diagonal block.
      Aperm[] stores values from perm[] for entries belonging to the diagonal block. Its length is Atot, which counts
        repeats (i.e., the same 'i,j' pair may appear multiple times).
6255       Ajmap[Annz+1] stores the number of repeats of each unique entry belonging to the diagonal block. More precisely, Ajmap[t+1] - Ajmap[t]
6256         is the number of repeats for the t-th unique entry in the diagonal block. Ajmap[0] is always 0.
6257 
6261     Bperm[], Bjmap[], Btot, Bnnz are similar but for the off-diagonal block.
6262 
6263     Aperm[],Bperm[],Ajmap[] and Bjmap[] are allocated separately by this routine with PetscMalloc1().
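
    A small hypothetical example: suppose a locally owned row r receives entries with global columns
    j = {7, 3, 3, 9}, with cstart = 0 and cend = 5. Column 3 falls in the diagonal block and columns 7, 9
    in the off-diagonal block. After sorting, [rowBegin[r], rowMid[r]) covers {3,3} and [rowMid[r], rowEnd[r])
    covers {7,9}, so this row contributes Atot += 2 and Annz += 1 (with Ajmap recording the repeat count 2),
    and Btot += 2 and Bnnz += 2.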
6264 */
6265 static PetscErrorCode MatSplitEntries_Internal(Mat mat, PetscCount n, const PetscInt i[], PetscInt j[], PetscCount perm[], PetscCount rowBegin[], PetscCount rowMid[], PetscCount rowEnd[], PetscCount *Atot_, PetscCount **Aperm_, PetscCount *Annz_, PetscCount **Ajmap_, PetscCount *Btot_, PetscCount **Bperm_, PetscCount *Bnnz_, PetscCount **Bjmap_)
6266 {
6267   PetscInt    cstart, cend, rstart, rend, row, col;
6268   PetscCount  Atot = 0, Btot = 0; /* Total number of nonzeros in the diagonal and off-diagonal blocks */
6269   PetscCount  Annz = 0, Bnnz = 0; /* Number of unique nonzeros in the diagonal and off-diagonal blocks */
6270   PetscCount  k, m, p, q, r, s, mid;
6271   PetscCount *Aperm, *Bperm, *Ajmap, *Bjmap;
6272 
6273   PetscFunctionBegin;
6274   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6275   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6276   m = rend - rstart;
6277 
6278   /* Skip negative rows */
6279   for (k = 0; k < n; k++)
6280     if (i[k] >= 0) break;
6281 
6282   /* Process [k,n): sort and partition each local row into diag and offdiag portions,
6283      fill rowBegin[], rowMid[], rowEnd[], and count Atot, Btot, Annz, Bnnz.
6284   */
6285   while (k < n) {
6286     row = i[k];
6287     /* Entries in [k,s) are in one row. Shift diagonal block col indices so that diag is ahead of offdiag after sorting the row */
6288     for (s = k; s < n; s++)
6289       if (i[s] != row) break;
6290 
6291     /* Shift diag columns to range of [-PETSC_INT_MAX, -1] */
6292     for (p = k; p < s; p++) {
6293       if (j[p] >= cstart && j[p] < cend) j[p] -= PETSC_INT_MAX;
6294       else PetscAssert((j[p] >= 0) && (j[p] < mat->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index %" PetscInt_FMT " is out of range", j[p]);
6295     }
6296     PetscCall(PetscSortIntWithCountArray(s - k, j + k, perm + k));
6297     PetscCall(PetscSortedIntUpperBound(j, k, s, -1, &mid)); /* Separate [k,s) into [k,mid) for diag and [mid,s) for offdiag */
6298     rowBegin[row - rstart] = k;
6299     rowMid[row - rstart]   = mid;
6300     rowEnd[row - rstart]   = s;
6301 
6302     /* Count nonzeros of this diag/offdiag row, which might have repeats */
6303     Atot += mid - k;
6304     Btot += s - mid;
6305 
6306     /* Count unique nonzeros of this diag row */
6307     for (p = k; p < mid;) {
6308       col = j[p];
6309       do {
6310         j[p] += PETSC_INT_MAX; /* Revert the modified diagonal indices */
6311         p++;
6312       } while (p < mid && j[p] == col);
6313       Annz++;
6314     }
6315 
6316     /* Count unique nonzeros of this offdiag row */
6317     for (p = mid; p < s;) {
6318       col = j[p];
6319       do {
6320         p++;
6321       } while (p < s && j[p] == col);
6322       Bnnz++;
6323     }
6324     k = s;
6325   }
6326 
6327   /* Allocation according to Atot, Btot, Annz, Bnnz */
6328   PetscCall(PetscMalloc1(Atot, &Aperm));
6329   PetscCall(PetscMalloc1(Btot, &Bperm));
6330   PetscCall(PetscMalloc1(Annz + 1, &Ajmap));
6331   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap));
6332 
6333   /* Re-scan indices and copy diag/offdiag permutation indices to Aperm, Bperm and also fill Ajmap and Bjmap */
6334   Ajmap[0] = Bjmap[0] = Atot = Btot = Annz = Bnnz = 0;
6335   for (r = 0; r < m; r++) {
6336     k   = rowBegin[r];
6337     mid = rowMid[r];
6338     s   = rowEnd[r];
6339     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Aperm, Atot), PetscSafePointerPlusOffset(perm, k), mid - k));
6340     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Bperm, Btot), PetscSafePointerPlusOffset(perm, mid), s - mid));
6341     Atot += mid - k;
6342     Btot += s - mid;
6343 
6344     /* Scan column indices in this row and find out how many repeats each unique nonzero has */
6345     for (p = k; p < mid;) {
6346       col = j[p];
6347       q   = p;
6348       do {
6349         p++;
6350       } while (p < mid && j[p] == col);
6351       Ajmap[Annz + 1] = Ajmap[Annz] + (p - q);
6352       Annz++;
6353     }
6354 
6355     for (p = mid; p < s;) {
6356       col = j[p];
6357       q   = p;
6358       do {
6359         p++;
6360       } while (p < s && j[p] == col);
6361       Bjmap[Bnnz + 1] = Bjmap[Bnnz] + (p - q);
6362       Bnnz++;
6363     }
6364   }
6365   /* Output */
6366   *Aperm_ = Aperm;
6367   *Annz_  = Annz;
6368   *Atot_  = Atot;
6369   *Ajmap_ = Ajmap;
6370   *Bperm_ = Bperm;
6371   *Bnnz_  = Bnnz;
6372   *Btot_  = Btot;
6373   *Bjmap_ = Bjmap;
6374   PetscFunctionReturn(PETSC_SUCCESS);
6375 }
6376 
6377 /*
6378   Expand the jmap[] array to make a new one in view of nonzeros in the merged matrix
6379 
6380   Input Parameters:
6381     nnz1: number of unique nonzeros in a set that was used to produce imap[], jmap[]
6382     nnz:  number of unique nonzeros in the merged matrix
6383     imap[nnz1]: i-th nonzero in the set is the imap[i]-th nonzero in the merged matrix
6384     jmap[nnz1+1]: i-th nonzero in the set has jmap[i+1] - jmap[i] repeats in the set
6385 
6386   Output Parameter: (memory is allocated by the caller)
6387     jmap_new[nnz+1]: i-th nonzero in the merged matrix has jmap_new[i+1] - jmap_new[i] repeats in the set
6388 
6389   Example:
6390     nnz1 = 4
6391     nnz  = 6
6392     imap = [1,3,4,5]
6393     jmap = [0,3,5,6,7]
6394    then,
6395     jmap_new = [0,0,3,3,5,6,7]
6396 */
6397 static PetscErrorCode ExpandJmap_Internal(PetscCount nnz1, PetscCount nnz, const PetscCount imap[], const PetscCount jmap[], PetscCount jmap_new[])
6398 {
6399   PetscCount k, p;
6400 
6401   PetscFunctionBegin;
6402   jmap_new[0] = 0;
6403   p           = nnz;                /* p loops over jmap_new[] backwards */
6404   for (k = nnz1 - 1; k >= 0; k--) { /* k loops over imap[] */
6405     for (; p > imap[k]; p--) jmap_new[p] = jmap[k + 1];
6406   }
6407   for (; p >= 0; p--) jmap_new[p] = jmap[0];
6408   PetscFunctionReturn(PETSC_SUCCESS);
6409 }
6410 
6411 static PetscErrorCode MatCOOStructDestroy_MPIAIJ(void **data)
6412 {
6413   MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)*data;
6414 
6415   PetscFunctionBegin;
6416   PetscCall(PetscSFDestroy(&coo->sf));
6417   PetscCall(PetscFree(coo->Aperm1));
6418   PetscCall(PetscFree(coo->Bperm1));
6419   PetscCall(PetscFree(coo->Ajmap1));
6420   PetscCall(PetscFree(coo->Bjmap1));
6421   PetscCall(PetscFree(coo->Aimap2));
6422   PetscCall(PetscFree(coo->Bimap2));
6423   PetscCall(PetscFree(coo->Aperm2));
6424   PetscCall(PetscFree(coo->Bperm2));
6425   PetscCall(PetscFree(coo->Ajmap2));
6426   PetscCall(PetscFree(coo->Bjmap2));
6427   PetscCall(PetscFree(coo->Cperm1));
6428   PetscCall(PetscFree2(coo->sendbuf, coo->recvbuf));
6429   PetscCall(PetscFree(coo));
6430   PetscFunctionReturn(PETSC_SUCCESS);
6431 }
6432 
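/* A sketch of the user-level COO assembly path implemented by the two routines below
   (hypothetical indices and values; error checking abbreviated):

     PetscInt    coo_i[] = {0, 0, 1};           // global row indices, repeats allowed
     PetscInt    coo_j[] = {0, 2, 0};           // global column indices
     PetscScalar v[]     = {1.0, 2.0, 3.0};

     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j)); // analyze the pattern once, build the communication plan
     PetscCall(MatSetValuesCOO(A, v, ADD_VALUES));          // may be called repeatedly with new values
*/
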
6433 PetscErrorCode MatSetPreallocationCOO_MPIAIJ(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
6434 {
6435   MPI_Comm             comm;
6436   PetscMPIInt          rank, size;
6437   PetscInt             m, n, M, N, rstart, rend, cstart, cend; /* Sizes, indices of row/col, therefore with type PetscInt */
6438   PetscCount           k, p, q, rem;                           /* Loop variables over coo arrays */
6439   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6440   PetscContainer       container;
6441   MatCOOStruct_MPIAIJ *coo;
6442 
6443   PetscFunctionBegin;
6444   PetscCall(PetscFree(mpiaij->garray));
6445   PetscCall(VecDestroy(&mpiaij->lvec));
6446 #if defined(PETSC_USE_CTABLE)
6447   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
6448 #else
6449   PetscCall(PetscFree(mpiaij->colmap));
6450 #endif
6451   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
6452   mat->assembled     = PETSC_FALSE;
6453   mat->was_assembled = PETSC_FALSE;
6454 
6455   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
6456   PetscCallMPI(MPI_Comm_size(comm, &size));
6457   PetscCallMPI(MPI_Comm_rank(comm, &rank));
6458   PetscCall(PetscLayoutSetUp(mat->rmap));
6459   PetscCall(PetscLayoutSetUp(mat->cmap));
6460   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6461   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6462   PetscCall(MatGetLocalSize(mat, &m, &n));
6463   PetscCall(MatGetSize(mat, &M, &N));
6464 
6465   /* Sort (i,j) by row along with a permutation array, so that the to-be-ignored */
6466   /* entries come first, then local rows, then remote rows.                     */
6467   PetscCount n1 = coo_n, *perm1;
6468   PetscInt  *i1 = coo_i, *j1 = coo_j;
6469 
6470   PetscCall(PetscMalloc1(n1, &perm1));
6471   for (k = 0; k < n1; k++) perm1[k] = k;
6472 
6473   /* Manipulate indices so that entries with negative row or col indices will have the smallest
6474      row indices, local entries will have greater but still negative row indices, and remote entries
6475      will have positive row indices.
6476   */
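  /* A hypothetical example of the shift: with rstart = 10 and rend = 20, a to-be-ignored entry gets row
     PETSC_INT_MIN, local row 12 becomes 12 - PETSC_INT_MAX (negative, but greater than PETSC_INT_MIN),
     and remote row 25 is left at 25, so a single sort by row yields the ignored/local/remote ordering. */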
6477   for (k = 0; k < n1; k++) {
6478     if (i1[k] < 0 || j1[k] < 0) i1[k] = PETSC_INT_MIN;                /* e.g., -2^31, the minimum value, to move these entries ahead */
6479     else if (i1[k] >= rstart && i1[k] < rend) i1[k] -= PETSC_INT_MAX; /* e.g., minus 2^31-1 to shift local rows to range of [-PETSC_INT_MAX, -1] */
6480     else {
6481       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "MAT_NO_OFF_PROC_ENTRIES is set but entries were inserted into remote rows");
6482       if (mpiaij->donotstash) i1[k] = PETSC_INT_MIN; /* Ignore offproc entries as if they had negative indices */
6483     }
6484   }
6485 
6486   /* Sort by row; after that, [0,k) have ignored entries, [k,rem) have local rows and [rem,n1) have remote rows */
6487   PetscCall(PetscSortIntWithIntCountArrayPair(n1, i1, j1, perm1));
6488 
6489   /* Advance k to the first entry we need to take care of */
6490   for (k = 0; k < n1; k++)
6491     if (i1[k] > PETSC_INT_MIN) break;
6492   PetscCount i1start = k;
6493 
6494   PetscCall(PetscSortedIntUpperBound(i1, k, n1, rend - 1 - PETSC_INT_MAX, &rem)); /* rem is upper bound of the last local row */
6495   for (; k < rem; k++) i1[k] += PETSC_INT_MAX;                                    /* Revert row indices of local rows */
6496 
6497   /*           Send remote rows to their owner                                  */
6498   /* Find which rows should be sent to which remote ranks */
6499   PetscInt        nsend = 0; /* Number of MPI ranks to send data to */
6500   PetscMPIInt    *sendto;    /* [nsend], storing remote ranks */
6501   PetscInt       *nentries;  /* [nsend], storing number of entries sent to remote ranks; Assume PetscInt is big enough for this count, and error if not */
6502   const PetscInt *ranges;
6503   PetscInt        maxNsend = size >= 128 ? 128 : size; /* Assume max 128 neighbors; realloc when needed */
6504 
6505   PetscCall(PetscLayoutGetRanges(mat->rmap, &ranges));
6506   PetscCall(PetscMalloc2(maxNsend, &sendto, maxNsend, &nentries));
6507   for (k = rem; k < n1;) {
6508     PetscMPIInt owner;
6509     PetscInt    firstRow, lastRow;
6510 
6511     /* Locate a row range */
6512     firstRow = i1[k]; /* first row of this owner */
6513     PetscCall(PetscLayoutFindOwner(mat->rmap, firstRow, &owner));
6514     lastRow = ranges[owner + 1] - 1; /* last row of this owner */
6515 
6516     /* Find the first index 'p' in [k,n1) with i1[p] belonging to the next owner */
6517     PetscCall(PetscSortedIntUpperBound(i1, k, n1, lastRow, &p));
6518 
6519     /* All entries in [k,p) belong to this remote owner */
6520     if (nsend >= maxNsend) { /* Double the remote ranks arrays if not long enough */
6521       PetscMPIInt *sendto2;
6522       PetscInt    *nentries2;
6523       PetscInt     maxNsend2 = (maxNsend <= size / 2) ? maxNsend * 2 : size;
6524 
6525       PetscCall(PetscMalloc2(maxNsend2, &sendto2, maxNsend2, &nentries2));
6526       PetscCall(PetscArraycpy(sendto2, sendto, maxNsend));
6527       PetscCall(PetscArraycpy(nentries2, nentries, maxNsend));
6528       PetscCall(PetscFree2(sendto, nentries));
6529       sendto   = sendto2;
6530       nentries = nentries2;
6531       maxNsend = maxNsend2;
6532     }
6533     sendto[nsend] = owner;
6534     PetscCall(PetscIntCast(p - k, &nentries[nsend]));
6535     nsend++;
6536     k = p;
6537   }
6538 
6539   /* Build 1st SF to know offsets on remote to send data */
6540   PetscSF      sf1;
6541   PetscInt     nroots = 1, nroots2 = 0;
6542   PetscInt     nleaves = nsend, nleaves2 = 0;
6543   PetscInt    *offsets;
6544   PetscSFNode *iremote;
6545 
6546   PetscCall(PetscSFCreate(comm, &sf1));
6547   PetscCall(PetscMalloc1(nsend, &iremote));
6548   PetscCall(PetscMalloc1(nsend, &offsets));
6549   for (k = 0; k < nsend; k++) {
6550     iremote[k].rank  = sendto[k];
6551     iremote[k].index = 0;
6552     nleaves2 += nentries[k];
6553     PetscCheck(nleaves2 >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF leaves is too large for PetscInt");
6554   }
6555   PetscCall(PetscSFSetGraph(sf1, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6556   PetscCall(PetscSFFetchAndOpWithMemTypeBegin(sf1, MPIU_INT, PETSC_MEMTYPE_HOST, &nroots2 /*rootdata*/, PETSC_MEMTYPE_HOST, nentries /*leafdata*/, PETSC_MEMTYPE_HOST, offsets /*leafupdate*/, MPI_SUM));
6557   PetscCall(PetscSFFetchAndOpEnd(sf1, MPIU_INT, &nroots2, nentries, offsets, MPI_SUM)); /* Should nroots2 overflow, the offsets[] check below catches it */
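  /* For instance (hypothetical numbers): if this rank sends nentries = {3, 5} to two owners, and another
     rank has already reserved 4 slots on the first owner, the fetch-and-op yields offsets = {4, 0}, i.e.,
     the pre-add value of each owner's counter, which is where our blocks start in the owners' buffers. */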
6558   PetscCall(PetscSFDestroy(&sf1));
6559   PetscAssert(nleaves2 == n1 - rem, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nleaves2 %" PetscInt_FMT " != number of remote entries %" PetscCount_FMT, nleaves2, n1 - rem);
6560 
6561   /* Build 2nd SF to send remote COOs to their owner */
6562   PetscSF sf2;
6563   nroots  = nroots2;
6564   nleaves = nleaves2;
6565   PetscCall(PetscSFCreate(comm, &sf2));
6566   PetscCall(PetscSFSetFromOptions(sf2));
6567   PetscCall(PetscMalloc1(nleaves, &iremote));
6568   p = 0;
6569   for (k = 0; k < nsend; k++) {
6570     PetscCheck(offsets[k] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF roots is too large for PetscInt");
6571     for (q = 0; q < nentries[k]; q++, p++) {
6572       iremote[p].rank = sendto[k];
6573       PetscCall(PetscIntCast(offsets[k] + q, &iremote[p].index));
6574     }
6575   }
6576   PetscCall(PetscSFSetGraph(sf2, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6577 
6578   /* Send the remote COOs to their owner */
6579   PetscInt    n2 = nroots, *i2, *j2; /* Buffers for received COOs from other ranks, along with a permutation array */
6580   PetscCount *perm2;                 /* Though PetscInt is enough for remote entries, we use PetscCount here as we want to reuse MatSplitEntries_Internal() */
6581   PetscCall(PetscMalloc3(n2, &i2, n2, &j2, n2, &perm2));
6582   PetscAssert(rem == 0 || i1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6583   PetscAssert(rem == 0 || j1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6584   PetscInt *i1prem = PetscSafePointerPlusOffset(i1, rem);
6585   PetscInt *j1prem = PetscSafePointerPlusOffset(j1, rem);
6586   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, i1prem, PETSC_MEMTYPE_HOST, i2, MPI_REPLACE));
6587   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, i1prem, i2, MPI_REPLACE));
6588   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, j1prem, PETSC_MEMTYPE_HOST, j2, MPI_REPLACE));
6589   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, j1prem, j2, MPI_REPLACE));
6590 
6591   PetscCall(PetscFree(offsets));
6592   PetscCall(PetscFree2(sendto, nentries));
6593 
6594   /* Sort received COOs by row along with the permutation array     */
6595   for (k = 0; k < n2; k++) perm2[k] = k;
6596   PetscCall(PetscSortIntWithIntCountArrayPair(n2, i2, j2, perm2));
6597 
6598   /* sf2 only sends contiguous leafdata to contiguous rootdata. We record the permutation which will be used to fill leafdata */
6599   PetscCount *Cperm1;
6600   PetscAssert(rem == 0 || perm1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6601   PetscCount *perm1prem = PetscSafePointerPlusOffset(perm1, rem);
6602   PetscCall(PetscMalloc1(nleaves, &Cperm1));
6603   PetscCall(PetscArraycpy(Cperm1, perm1prem, nleaves));
6604 
6605   /* Support for HYPRE matrices, kind of a hack.
6606      Swap min column with diagonal so that diagonal values will go first */
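  /* A hypothetical illustration: if local row r (whose global diagonal column is cstart + r = 6) holds
     columns {4, 6, 9}, then minj[r] = 4 and the swap below relabels column 6 as 4 and column 4 as 6, so
     the diagonal value sorts first within the row, as HYPRE expects. */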
6607   PetscBool hypre;
6608   PetscCall(PetscStrcmp("_internal_COO_mat_for_hypre", ((PetscObject)mat)->name, &hypre));
6609   if (hypre) {
6610     PetscInt *minj;
6611     PetscBT   hasdiag;
6612 
6613     PetscCall(PetscBTCreate(m, &hasdiag));
6614     PetscCall(PetscMalloc1(m, &minj));
6615     for (k = 0; k < m; k++) minj[k] = PETSC_INT_MAX;
6616     for (k = i1start; k < rem; k++) {
6617       if (j1[k] < cstart || j1[k] >= cend) continue;
6618       const PetscInt rindex = i1[k] - rstart;
6619       if ((j1[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6620       minj[rindex] = PetscMin(minj[rindex], j1[k]);
6621     }
6622     for (k = 0; k < n2; k++) {
6623       if (j2[k] < cstart || j2[k] >= cend) continue;
6624       const PetscInt rindex = i2[k] - rstart;
6625       if ((j2[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6626       minj[rindex] = PetscMin(minj[rindex], j2[k]);
6627     }
6628     for (k = i1start; k < rem; k++) {
6629       const PetscInt rindex = i1[k] - rstart;
6630       if (j1[k] < cstart || j1[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6631       if (j1[k] == minj[rindex]) j1[k] = i1[k] + (cstart - rstart);
6632       else if ((j1[k] - cstart) == rindex) j1[k] = minj[rindex];
6633     }
6634     for (k = 0; k < n2; k++) {
6635       const PetscInt rindex = i2[k] - rstart;
6636       if (j2[k] < cstart || j2[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6637       if (j2[k] == minj[rindex]) j2[k] = i2[k] + (cstart - rstart);
6638       else if ((j2[k] - cstart) == rindex) j2[k] = minj[rindex];
6639     }
6640     PetscCall(PetscBTDestroy(&hasdiag));
6641     PetscCall(PetscFree(minj));
6642   }
6643 
6644   /* Split local COOs and received COOs into diag/offdiag portions */
6645   PetscCount *rowBegin1, *rowMid1, *rowEnd1;
6646   PetscCount *Ajmap1, *Aperm1, *Bjmap1, *Bperm1;
6647   PetscCount  Annz1, Bnnz1, Atot1, Btot1;
6648   PetscCount *rowBegin2, *rowMid2, *rowEnd2;
6649   PetscCount *Ajmap2, *Aperm2, *Bjmap2, *Bperm2;
6650   PetscCount  Annz2, Bnnz2, Atot2, Btot2;
6651 
6652   PetscCall(PetscCalloc3(m, &rowBegin1, m, &rowMid1, m, &rowEnd1));
6653   PetscCall(PetscCalloc3(m, &rowBegin2, m, &rowMid2, m, &rowEnd2));
6654   PetscCall(MatSplitEntries_Internal(mat, rem, i1, j1, perm1, rowBegin1, rowMid1, rowEnd1, &Atot1, &Aperm1, &Annz1, &Ajmap1, &Btot1, &Bperm1, &Bnnz1, &Bjmap1));
6655   PetscCall(MatSplitEntries_Internal(mat, n2, i2, j2, perm2, rowBegin2, rowMid2, rowEnd2, &Atot2, &Aperm2, &Annz2, &Ajmap2, &Btot2, &Bperm2, &Bnnz2, &Bjmap2));
6656 
6657   /* Merge local COOs with received COOs: diag with diag, offdiag with offdiag */
6658   PetscInt *Ai, *Bi;
6659   PetscInt *Aj, *Bj;
6660 
6661   PetscCall(PetscMalloc1(m + 1, &Ai));
6662   PetscCall(PetscMalloc1(m + 1, &Bi));
6663   PetscCall(PetscMalloc1(Annz1 + Annz2, &Aj)); /* Since local and remote entries might have dups, we might allocate excess memory */
6664   PetscCall(PetscMalloc1(Bnnz1 + Bnnz2, &Bj));
6665 
6666   PetscCount *Aimap1, *Bimap1, *Aimap2, *Bimap2;
6667   PetscCall(PetscMalloc1(Annz1, &Aimap1));
6668   PetscCall(PetscMalloc1(Bnnz1, &Bimap1));
6669   PetscCall(PetscMalloc1(Annz2, &Aimap2));
6670   PetscCall(PetscMalloc1(Bnnz2, &Bimap2));
6671 
6672   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowBegin1, rowMid1, rowBegin2, rowMid2, Ajmap1, Ajmap2, Aimap1, Aimap2, Ai, Aj));
6673   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowMid1, rowEnd1, rowMid2, rowEnd2, Bjmap1, Bjmap2, Bimap1, Bimap2, Bi, Bj));
6674 
6675   /* Expand Ajmap1/Bjmap1 so they are indexed by the nonzeros of A/B, since we  */
6676   /* expect most nonzeros in A/B to have local contributing entries             */
6677   PetscInt    Annz = Ai[m];
6678   PetscInt    Bnnz = Bi[m];
6679   PetscCount *Ajmap1_new, *Bjmap1_new;
6680 
6681   PetscCall(PetscMalloc1(Annz + 1, &Ajmap1_new));
6682   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap1_new));
6683 
6684   PetscCall(ExpandJmap_Internal(Annz1, Annz, Aimap1, Ajmap1, Ajmap1_new));
6685   PetscCall(ExpandJmap_Internal(Bnnz1, Bnnz, Bimap1, Bjmap1, Bjmap1_new));
6686 
6687   PetscCall(PetscFree(Aimap1));
6688   PetscCall(PetscFree(Ajmap1));
6689   PetscCall(PetscFree(Bimap1));
6690   PetscCall(PetscFree(Bjmap1));
6691   PetscCall(PetscFree3(rowBegin1, rowMid1, rowEnd1));
6692   PetscCall(PetscFree3(rowBegin2, rowMid2, rowEnd2));
6693   PetscCall(PetscFree(perm1));
6694   PetscCall(PetscFree3(i2, j2, perm2));
6695 
6696   Ajmap1 = Ajmap1_new;
6697   Bjmap1 = Bjmap1_new;
6698 
6699   /* Reallocate Aj, Bj once we know actual numbers of unique nonzeros in A and B */
6700   if (Annz < Annz1 + Annz2) {
6701     PetscInt *Aj_new;
6702     PetscCall(PetscMalloc1(Annz, &Aj_new));
6703     PetscCall(PetscArraycpy(Aj_new, Aj, Annz));
6704     PetscCall(PetscFree(Aj));
6705     Aj = Aj_new;
6706   }
6707 
6708   if (Bnnz < Bnnz1 + Bnnz2) {
6709     PetscInt *Bj_new;
6710     PetscCall(PetscMalloc1(Bnnz, &Bj_new));
6711     PetscCall(PetscArraycpy(Bj_new, Bj, Bnnz));
6712     PetscCall(PetscFree(Bj));
6713     Bj = Bj_new;
6714   }
6715 
6716   /* Create new submatrices for on-process and off-process coupling                  */
6717   PetscScalar     *Aa, *Ba;
6718   MatType          rtype;
6719   Mat_SeqAIJ      *a, *b;
6720   PetscObjectState state;
6721   PetscCall(PetscCalloc1(Annz, &Aa)); /* Allocate zeroed matrix values */
6722   PetscCall(PetscCalloc1(Bnnz, &Ba));
6723   /* make Aj[] local, i.e., based off the start column of the diagonal portion */
6724   if (cstart) {
6725     for (k = 0; k < Annz; k++) Aj[k] -= cstart;
6726   }
6727 
6728   PetscCall(MatGetRootType_Private(mat, &rtype));
6729 
6730   MatSeqXAIJGetOptions_Private(mpiaij->A);
6731   PetscCall(MatDestroy(&mpiaij->A));
6732   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, Ai, Aj, Aa, &mpiaij->A));
6733   PetscCall(MatSetBlockSizesFromMats(mpiaij->A, mat, mat));
6734   MatSeqXAIJRestoreOptions_Private(mpiaij->A);
6735 
6736   MatSeqXAIJGetOptions_Private(mpiaij->B);
6737   PetscCall(MatDestroy(&mpiaij->B));
6738   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, mat->cmap->N, Bi, Bj, Ba, &mpiaij->B));
6739   PetscCall(MatSetBlockSizesFromMats(mpiaij->B, mat, mat));
6740   MatSeqXAIJRestoreOptions_Private(mpiaij->B);
6741 
6742   PetscCall(MatSetUpMultiply_MPIAIJ(mat));
6743   mat->was_assembled = PETSC_TRUE; // was_assembled in effect means the Mvctx is built; doing so avoids redundant MatSetUpMultiply_MPIAIJ
6744   state              = mpiaij->A->nonzerostate + mpiaij->B->nonzerostate;
6745   PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
6746 
6747   a          = (Mat_SeqAIJ *)mpiaij->A->data;
6748   b          = (Mat_SeqAIJ *)mpiaij->B->data;
6749   a->free_a  = PETSC_TRUE;
6750   a->free_ij = PETSC_TRUE;
6751   b->free_a  = PETSC_TRUE;
6752   b->free_ij = PETSC_TRUE;
6753   a->maxnz   = a->nz;
6754   b->maxnz   = b->nz;
6755 
6756   /* conversion must happen AFTER multiply setup */
6757   PetscCall(MatConvert(mpiaij->A, rtype, MAT_INPLACE_MATRIX, &mpiaij->A));
6758   PetscCall(MatConvert(mpiaij->B, rtype, MAT_INPLACE_MATRIX, &mpiaij->B));
6759   PetscCall(VecDestroy(&mpiaij->lvec));
6760   PetscCall(MatCreateVecs(mpiaij->B, &mpiaij->lvec, NULL));
6761 
6762   // Put the COO struct in a container and then attach that to the matrix
6763   PetscCall(PetscMalloc1(1, &coo));
6764   coo->n       = coo_n;
6765   coo->sf      = sf2;
6766   coo->sendlen = nleaves;
6767   coo->recvlen = nroots;
6768   coo->Annz    = Annz;
6769   coo->Bnnz    = Bnnz;
6770   coo->Annz2   = Annz2;
6771   coo->Bnnz2   = Bnnz2;
6772   coo->Atot1   = Atot1;
6773   coo->Atot2   = Atot2;
6774   coo->Btot1   = Btot1;
6775   coo->Btot2   = Btot2;
6776   coo->Ajmap1  = Ajmap1;
6777   coo->Aperm1  = Aperm1;
6778   coo->Bjmap1  = Bjmap1;
6779   coo->Bperm1  = Bperm1;
6780   coo->Aimap2  = Aimap2;
6781   coo->Ajmap2  = Ajmap2;
6782   coo->Aperm2  = Aperm2;
6783   coo->Bimap2  = Bimap2;
6784   coo->Bjmap2  = Bjmap2;
6785   coo->Bperm2  = Bperm2;
6786   coo->Cperm1  = Cperm1;
6787   // Allocate the buffers here during preallocation. If not used, they have zero cost on host
6788   PetscCall(PetscMalloc2(coo->sendlen, &coo->sendbuf, coo->recvlen, &coo->recvbuf));
6789   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
6790   PetscCall(PetscContainerSetPointer(container, coo));
6791   PetscCall(PetscContainerSetCtxDestroy(container, MatCOOStructDestroy_MPIAIJ));
6792   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject)container));
6793   PetscCall(PetscContainerDestroy(&container));
6794   PetscFunctionReturn(PETSC_SUCCESS);
6795 }
6796 
6797 static PetscErrorCode MatSetValuesCOO_MPIAIJ(Mat mat, const PetscScalar v[], InsertMode imode)
6798 {
6799   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6800   Mat                  A = mpiaij->A, B = mpiaij->B;
6801   PetscScalar         *Aa, *Ba;
6802   PetscScalar         *sendbuf, *recvbuf;
6803   const PetscCount    *Ajmap1, *Ajmap2, *Aimap2;
6804   const PetscCount    *Bjmap1, *Bjmap2, *Bimap2;
6805   const PetscCount    *Aperm1, *Aperm2, *Bperm1, *Bperm2;
6806   const PetscCount    *Cperm1;
6807   PetscContainer       container;
6808   MatCOOStruct_MPIAIJ *coo;
6809 
6810   PetscFunctionBegin;
6811   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container));
6812   PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
6813   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
6814   sendbuf = coo->sendbuf;
6815   recvbuf = coo->recvbuf;
6816   Ajmap1  = coo->Ajmap1;
6817   Ajmap2  = coo->Ajmap2;
6818   Aimap2  = coo->Aimap2;
6819   Bjmap1  = coo->Bjmap1;
6820   Bjmap2  = coo->Bjmap2;
6821   Bimap2  = coo->Bimap2;
6822   Aperm1  = coo->Aperm1;
6823   Aperm2  = coo->Aperm2;
6824   Bperm1  = coo->Bperm1;
6825   Bperm2  = coo->Bperm2;
6826   Cperm1  = coo->Cperm1;
6827 
6828   PetscCall(MatSeqAIJGetArray(A, &Aa)); /* Might read and write matrix values */
6829   PetscCall(MatSeqAIJGetArray(B, &Ba));
6830 
6831   /* Pack entries to be sent to remote ranks */
6832   for (PetscCount i = 0; i < coo->sendlen; i++) sendbuf[i] = v[Cperm1[i]];
6833 
6834   /* Send remote entries to their owner and overlap the communication with local computation */
6835   PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_HOST, sendbuf, PETSC_MEMTYPE_HOST, recvbuf, MPI_REPLACE));
6836   /* Add local entries to A and B */
6837   for (PetscCount i = 0; i < coo->Annz; i++) { /* All nonzeros in A are either zero'ed or added with a value (i.e., initialized) */
6838     PetscScalar sum = 0.0;                     /* Do partial summation first to improve numerical stability */
6839     for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += v[Aperm1[k]];
6840     Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
6841   }
6842   for (PetscCount i = 0; i < coo->Bnnz; i++) {
6843     PetscScalar sum = 0.0;
6844     for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += v[Bperm1[k]];
6845     Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
6846   }
6847   PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE));
6848 
6849   /* Add received remote entries to A and B */
6850   for (PetscCount i = 0; i < coo->Annz2; i++) {
6851     for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += recvbuf[Aperm2[k]];
6852   }
6853   for (PetscCount i = 0; i < coo->Bnnz2; i++) {
6854     for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += recvbuf[Bperm2[k]];
6855   }
6856   PetscCall(MatSeqAIJRestoreArray(A, &Aa));
6857   PetscCall(MatSeqAIJRestoreArray(B, &Ba));
6858   PetscFunctionReturn(PETSC_SUCCESS);
6859 }
6860 
6861 /*MC
6862    MATMPIAIJ - MATMPIAIJ = "mpiaij" - A matrix type to be used for parallel sparse matrices.
6863 
6864    Options Database Keys:
6865 . -mat_type mpiaij - sets the matrix type to `MATMPIAIJ` during a call to `MatSetFromOptions()`
6866 
6867    Level: beginner
6868 
6869    Notes:
6870    `MatSetValues()` may be called for this matrix type with a `NULL` argument for the numerical values;
6871     in this case the values associated with the rows and columns one passes in are set to zero
6872     in the matrix
6873 
6874     `MatSetOption`(,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this case no
6875     space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored
6876 
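   Example:
   A minimal usage sketch (a square matrix of global size N with default layouts; indices and values are placeholders):
.vb
   Mat A;
   MatCreate(comm, &A);
   MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, N, N);
   MatSetFromOptions(A); // -mat_type mpiaij selects this type
   MatSetUp(A);
   MatSetValues(A, 1, &row, 1, &col, &val, ADD_VALUES);
   MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);
   MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);
.ve
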
6877 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJ`, `MATAIJ`, `MatCreateAIJ()`
6878 M*/
6879 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJ(Mat B)
6880 {
6881   Mat_MPIAIJ *b;
6882   PetscMPIInt size;
6883 
6884   PetscFunctionBegin;
6885   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
6886 
6887   PetscCall(PetscNew(&b));
6888   B->data       = (void *)b;
6889   B->ops[0]     = MatOps_Values;
6890   B->assembled  = PETSC_FALSE;
6891   B->insertmode = NOT_SET_VALUES;
6892   b->size       = size;
6893 
6894   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B), &b->rank));
6895 
6896   /* build cache for off array entries formed */
6897   PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)B), 1, &B->stash));
6898 
6899   b->donotstash  = PETSC_FALSE;
6900   b->colmap      = NULL;
6901   b->garray      = NULL;
6902   b->roworiented = PETSC_TRUE;
6903 
6904   /* stuff used for matrix vector multiply */
6905   b->lvec  = NULL;
6906   b->Mvctx = NULL;
6907 
6908   /* stuff for MatGetRow() */
6909   b->rowindices   = NULL;
6910   b->rowvalues    = NULL;
6911   b->getrowactive = PETSC_FALSE;
6912 
6913   /* flexible pointer used in CUSPARSE classes */
6914   b->spptr = NULL;
6915 
6916   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetUseScalableIncreaseOverlap_C", MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ));
6917   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_MPIAIJ));
6918   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_MPIAIJ));
6919   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_MPIAIJ));
6920   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJ));
6921   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetPreallocation_C", MatResetPreallocation_MPIAIJ));
6922   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetHash_C", MatResetHash_MPIAIJ));
6923   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocationCSR_C", MatMPIAIJSetPreallocationCSR_MPIAIJ));
6924   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDiagonalScaleLocal_C", MatDiagonalScaleLocal_MPIAIJ));
6925   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijperm_C", MatConvert_MPIAIJ_MPIAIJPERM));
6926   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijsell_C", MatConvert_MPIAIJ_MPIAIJSELL));
6927 #if defined(PETSC_HAVE_CUDA)
6928   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcusparse_C", MatConvert_MPIAIJ_MPIAIJCUSPARSE));
6929 #endif
6930 #if defined(PETSC_HAVE_HIP)
6931   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijhipsparse_C", MatConvert_MPIAIJ_MPIAIJHIPSPARSE));
6932 #endif
6933 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6934   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijkokkos_C", MatConvert_MPIAIJ_MPIAIJKokkos));
6935 #endif
6936 #if defined(PETSC_HAVE_MKL_SPARSE)
6937   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijmkl_C", MatConvert_MPIAIJ_MPIAIJMKL));
6938 #endif
6939   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcrl_C", MatConvert_MPIAIJ_MPIAIJCRL));
6940   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpibaij_C", MatConvert_MPIAIJ_MPIBAIJ));
6941   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisbaij_C", MatConvert_MPIAIJ_MPISBAIJ));
6942   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpidense_C", MatConvert_MPIAIJ_MPIDense));
6943 #if defined(PETSC_HAVE_ELEMENTAL)
6944   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_elemental_C", MatConvert_MPIAIJ_Elemental));
6945 #endif
6946 #if defined(PETSC_HAVE_SCALAPACK)
6947   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_scalapack_C", MatConvert_AIJ_ScaLAPACK));
6948 #endif
6949   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_is_C", MatConvert_XAIJ_IS));
6950   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisell_C", MatConvert_MPIAIJ_MPISELL));
6951 #if defined(PETSC_HAVE_HYPRE)
6952   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_hypre_C", MatConvert_AIJ_HYPRE));
6953   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", MatProductSetFromOptions_Transpose_AIJ_AIJ));
6954 #endif
6955   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_is_mpiaij_C", MatProductSetFromOptions_IS_XAIJ));
6956   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_mpiaij_mpiaij_C", MatProductSetFromOptions_MPIAIJ));
6957   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJ));
6958   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJ));
6959   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJ));
6960   PetscFunctionReturn(PETSC_SUCCESS);
6961 }
6962 
6963 /*@
6964   MatCreateMPIAIJWithSplitArrays - creates a `MATMPIAIJ` matrix using arrays that contain the "diagonal"
6965   and "off-diagonal" part of the matrix in CSR format.
6966 
6967   Collective
6968 
6969   Input Parameters:
6970 + comm - MPI communicator
6971 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
6972 . n    - This value should be the same as the local size used in creating the
6973          x vector for the matrix-vector product $y = Ax$ (or `PETSC_DECIDE` to have it
6974          calculated if `N` is given). For square matrices `n` is almost always `m`.
6975 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
6976 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
6977 . i    - row indices for "diagonal" portion of matrix; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
6978 . j    - column indices, which must be local, i.e., based off the start column of the diagonal portion
6979 . a    - matrix values
6980 . oi   - row indices for "off-diagonal" portion of matrix; that is oi[0] = 0, oi[row] = oi[row-1] + number of elements in that row of the matrix
6981 . oj   - column indices, which must be global, representing global columns in the `MATMPIAIJ` matrix
6982 - oa   - matrix values
6983 
6984   Output Parameter:
6985 . mat - the matrix
6986 
6987   Level: advanced
6988 
6989   Notes:
6990   The `i`, `j`, and `a` arrays ARE NOT copied by this routine into the internal format used by PETSc (even in Fortran). The user
6991   must free the arrays once the matrix has been destroyed and not before.
6992 
6993   The `i` and `j` indices are 0 based
6994 
6995   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix
6996 
6997   This sets local rows and cannot be used to set off-processor values.
6998 
6999   Use of this routine is discouraged because it is inflexible and cumbersome to use. It is extremely rare that a
7000   legacy application natively assembles into exactly this split format. The code to do so is nontrivial and does
7001   not easily support in-place reassembly. It is recommended to use `MatSetValues()` (or a variant thereof) because
7002   the resulting assembly is easier to implement, will work with any matrix format, and the user does not have to
7003   keep track of the underlying arrays. Use `MatSetOption`(A,`MAT_NO_OFF_PROC_ENTRIES`,`PETSC_TRUE`) to disable all
7004   communication if it is known that only local entries will be set.
7005 
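  Example:
  A small hypothetical layout for rank 0 of a 4x4 matrix distributed over two ranks, where rank 0 owns rows 0-1
  and columns 0-1 (so the "diagonal" block uses local column indices 0-1 and the "off-diagonal" block uses global
  column indices 2-3). Stack arrays are only for illustration, since the arrays are not copied and must outlive
  the matrix:
.vb
  // rank 0 holds A(0,0)=1, A(0,2)=2, A(1,1)=3, A(1,3)=4
  PetscInt    i[]  = {0, 1, 2}, j[]  = {0, 1}; // diagonal block in CSR with local column indices
  PetscScalar a[]  = {1.0, 3.0};
  PetscInt    oi[] = {0, 1, 2}, oj[] = {2, 3}; // off-diagonal block in CSR with global column indices
  PetscScalar oa[] = {2.0, 4.0};
  MatCreateMPIAIJWithSplitArrays(comm, 2, 2, 4, 4, i, j, a, oi, oj, oa, &A);
.ve
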
7006 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
7007           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithArrays()`
7008 @*/
7009 PetscErrorCode MatCreateMPIAIJWithSplitArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt i[], PetscInt j[], PetscScalar a[], PetscInt oi[], PetscInt oj[], PetscScalar oa[], Mat *mat)
7010 {
7011   Mat_MPIAIJ *maij;
7012 
7013   PetscFunctionBegin;
7014   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
7015   PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
7016   PetscCheck(oi[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "oi (row indices) must start with 0");
7017   PetscCall(MatCreate(comm, mat));
7018   PetscCall(MatSetSizes(*mat, m, n, M, N));
7019   PetscCall(MatSetType(*mat, MATMPIAIJ));
7020   maij = (Mat_MPIAIJ *)(*mat)->data;
7021 
7022   (*mat)->preallocated = PETSC_TRUE;
7023 
7024   PetscCall(PetscLayoutSetUp((*mat)->rmap));
7025   PetscCall(PetscLayoutSetUp((*mat)->cmap));
7026 
7027   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, i, j, a, &maij->A));
7028   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, (*mat)->cmap->N, oi, oj, oa, &maij->B));
7029 
7030   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
7031   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
7032   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
7033   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
7034   PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
7035   PetscFunctionReturn(PETSC_SUCCESS);
7036 }
7037 
7038 typedef struct {
7039   Mat       *mp;    /* intermediate products */
7040   PetscBool *mptmp; /* is the intermediate product temporary? */
7041   PetscInt   cp;    /* number of intermediate products */
7042 
7043   /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
7044   PetscInt    *startsj_s, *startsj_r;
7045   PetscScalar *bufa;
7046   Mat          P_oth;
7047 
7048   /* may take advantage of merging product->B */
7049   Mat Bloc; /* B-local by merging diag and off-diag */
7050 
7051   /* cusparse does not support splitting the symbolic and numeric phases.
7052      When api_user is true, we don't need to update the numerical values
7053      of the temporary storage */
7054   PetscBool reusesym;
7055 
7056   /* support for COO values insertion */
7057   PetscScalar *coo_v, *coo_w; /* store on-process and off-process COO scalars, and used as MPI recv/send buffers respectively */
7058   PetscInt   **own;           /* own[i] points to address of on-process COO indices for Mat mp[i] */
7059   PetscInt   **off;           /* off[i] points to address of off-process COO indices for Mat mp[i] */
7060   PetscBool    hasoffproc;    /* if true, have off-process values insertion (i.e. AtB or PtAP) */
7061   PetscSF      sf;            /* used for non-local values insertion and memory malloc */
7062   PetscMemType mtype;
7063 
7064   /* customization */
7065   PetscBool abmerge;
7066   PetscBool P_oth_bind;
7067 } MatMatMPIAIJBACKEND;
7068 
7069 static PetscErrorCode MatDestroy_MatMatMPIAIJBACKEND(void *data)
7070 {
7071   MatMatMPIAIJBACKEND *mmdata = (MatMatMPIAIJBACKEND *)data;
7072   PetscInt             i;
7073 
7074   PetscFunctionBegin;
7075   PetscCall(PetscFree2(mmdata->startsj_s, mmdata->startsj_r));
7076   PetscCall(PetscFree(mmdata->bufa));
7077   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_v));
7078   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_w));
7079   PetscCall(MatDestroy(&mmdata->P_oth));
7080   PetscCall(MatDestroy(&mmdata->Bloc));
7081   PetscCall(PetscSFDestroy(&mmdata->sf));
7082   for (i = 0; i < mmdata->cp; i++) PetscCall(MatDestroy(&mmdata->mp[i]));
7083   PetscCall(PetscFree2(mmdata->mp, mmdata->mptmp));
7084   PetscCall(PetscFree(mmdata->own[0]));
7085   PetscCall(PetscFree(mmdata->own));
7086   PetscCall(PetscFree(mmdata->off[0]));
7087   PetscCall(PetscFree(mmdata->off));
7088   PetscCall(PetscFree(mmdata));
7089   PetscFunctionReturn(PETSC_SUCCESS);
7090 }
7091 
7092 /* Copy the n selected entries of A, whose indices are given in idx[], to v[].
7093    If idx is NULL, copy the whole data array of A to v[]
7094  */
7095 static PetscErrorCode MatSeqAIJCopySubArray(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
7096 {
7097   PetscErrorCode (*f)(Mat, PetscInt, const PetscInt[], PetscScalar[]);
7098 
7099   PetscFunctionBegin;
7100   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", &f));
7101   if (f) {
7102     PetscCall((*f)(A, n, idx, v));
7103   } else {
7104     const PetscScalar *vv;
7105 
7106     PetscCall(MatSeqAIJGetArrayRead(A, &vv));
7107     if (n && idx) {
7108       PetscScalar    *w  = v;
7109       const PetscInt *oi = idx;
7110       PetscInt        j;
7111 
7112       for (j = 0; j < n; j++) *w++ = vv[*oi++];
7113     } else {
7114       PetscCall(PetscArraycpy(v, vv, n));
7115     }
7116     PetscCall(MatSeqAIJRestoreArrayRead(A, &vv));
7117   }
7118   PetscFunctionReturn(PETSC_SUCCESS);
7119 }
7120 
7121 static PetscErrorCode MatProductNumeric_MPIAIJBACKEND(Mat C)
7122 {
7123   MatMatMPIAIJBACKEND *mmdata;
7124   PetscInt             i, n_d, n_o;
7125 
7126   PetscFunctionBegin;
7127   MatCheckProduct(C, 1);
7128   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
7129   mmdata = (MatMatMPIAIJBACKEND *)C->product->data;
7130   if (!mmdata->reusesym) { /* update temporary matrices */
7131     if (mmdata->P_oth) PetscCall(MatGetBrowsOfAoCols_MPIAIJ(C->product->A, C->product->B, MAT_REUSE_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7132     if (mmdata->Bloc) PetscCall(MatMPIAIJGetLocalMatMerge(C->product->B, MAT_REUSE_MATRIX, NULL, &mmdata->Bloc));
7133   }
7134   mmdata->reusesym = PETSC_FALSE;
7135 
7136   for (i = 0; i < mmdata->cp; i++) {
7137     PetscCheck(mmdata->mp[i]->ops->productnumeric, PetscObjectComm((PetscObject)mmdata->mp[i]), PETSC_ERR_PLIB, "Missing numeric op for %s", MatProductTypes[mmdata->mp[i]->product->type]);
7138     PetscCall((*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]));
7139   }
7140   for (i = 0, n_d = 0, n_o = 0; i < mmdata->cp; i++) {
7141     PetscInt noff;
7142 
7143     PetscCall(PetscIntCast(mmdata->off[i + 1] - mmdata->off[i], &noff));
7144     if (mmdata->mptmp[i]) continue;
7145     if (noff) {
7146       PetscInt nown;
7147 
7148       PetscCall(PetscIntCast(mmdata->own[i + 1] - mmdata->own[i], &nown));
7149       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], noff, mmdata->off[i], mmdata->coo_w + n_o));
7150       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], nown, mmdata->own[i], mmdata->coo_v + n_d));
7151       n_o += noff;
7152       n_d += nown;
7153     } else {
7154       Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mmdata->mp[i]->data;
7155 
7156       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], mm->nz, NULL, mmdata->coo_v + n_d));
7157       n_d += mm->nz;
7158     }
7159   }
7160   if (mmdata->hasoffproc) { /* offprocess insertion */
7161     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7162     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7163   }
7164   PetscCall(MatSetValuesCOO(C, mmdata->coo_v, INSERT_VALUES));
7165   PetscFunctionReturn(PETSC_SUCCESS);
7166 }
7167 
7168 /* Support for Pt * A, A * P, or Pt * A * P */
7169 #define MAX_NUMBER_INTERMEDIATE 4
7170 PetscErrorCode MatProductSymbolic_MPIAIJBACKEND(Mat C)
7171 {
7172   Mat_Product           *product = C->product;
7173   Mat                    A, P, mp[MAX_NUMBER_INTERMEDIATE]; /* A, P and a series of intermediate matrices */
7174   Mat_MPIAIJ            *a, *p;
7175   MatMatMPIAIJBACKEND   *mmdata;
7176   ISLocalToGlobalMapping P_oth_l2g = NULL;
7177   IS                     glob      = NULL;
7178   const char            *prefix;
7179   char                   pprefix[256];
7180   const PetscInt        *globidx, *P_oth_idx;
7181   PetscInt               i, j, cp, m, n, M, N, *coo_i, *coo_j;
7182   PetscCount             ncoo, ncoo_d, ncoo_o, ncoo_oown;
7183   PetscInt               cmapt[MAX_NUMBER_INTERMEDIATE], rmapt[MAX_NUMBER_INTERMEDIATE]; /* col/row map type for each Mat in mp[]. */
7184                                                                                          /* type-0: consecutive, start from 0; type-1: consecutive with */
7185                                                                                          /* a base offset; type-2: sparse with a local to global map table */
7186   const PetscInt *cmapa[MAX_NUMBER_INTERMEDIATE], *rmapa[MAX_NUMBER_INTERMEDIATE];       /* col/row local to global map array (table) for type-2 map type */
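  /* For example (hypothetical indices): a type-1 column map with base offset cstart sends local column c of
     mp[i] to global column c + cstart, while a type-2 map goes through a lookup table such as cmapa[i] =
     p->garray, so that local column c maps to global column p->garray[c]. */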
7187 
7188   MatProductType ptype;
7189   PetscBool      mptmp[MAX_NUMBER_INTERMEDIATE], hasoffproc = PETSC_FALSE, iscuda, iship, iskokk;
7190   PetscMPIInt    size;
7191 
7192   PetscFunctionBegin;
7193   MatCheckProduct(C, 1);
7194   PetscCheck(!product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
7195   ptype = product->type;
7196   if (product->A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
7197     ptype                                          = MATPRODUCT_AB;
7198     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
7199   }
7200   switch (ptype) {
7201   case MATPRODUCT_AB:
7202     A          = product->A;
7203     P          = product->B;
7204     m          = A->rmap->n;
7205     n          = P->cmap->n;
7206     M          = A->rmap->N;
7207     N          = P->cmap->N;
7208     hasoffproc = PETSC_FALSE; /* will not scatter mat product values to other processes */
7209     break;
7210   case MATPRODUCT_AtB:
7211     P          = product->A;
7212     A          = product->B;
7213     m          = P->cmap->n;
7214     n          = A->cmap->n;
7215     M          = P->cmap->N;
7216     N          = A->cmap->N;
7217     hasoffproc = PETSC_TRUE;
7218     break;
7219   case MATPRODUCT_PtAP:
7220     A          = product->A;
7221     P          = product->B;
7222     m          = P->cmap->n;
7223     n          = P->cmap->n;
7224     M          = P->cmap->N;
7225     N          = P->cmap->N;
7226     hasoffproc = PETSC_TRUE;
7227     break;
7228   default:
7229     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7230   }
7231   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)C), &size));
7232   if (size == 1) hasoffproc = PETSC_FALSE;
7233 
7234   /* defaults */
7235   for (i = 0; i < MAX_NUMBER_INTERMEDIATE; i++) {
7236     mp[i]    = NULL;
7237     mptmp[i] = PETSC_FALSE;
7238     rmapt[i] = -1;
7239     cmapt[i] = -1;
7240     rmapa[i] = NULL;
7241     cmapa[i] = NULL;
7242   }
7243 
7244   /* customization */
7245   PetscCall(PetscNew(&mmdata));
7246   mmdata->reusesym = product->api_user;
7247   if (ptype == MATPRODUCT_AB) {
7248     if (product->api_user) {
7249       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
7250       PetscCall(PetscOptionsBool("-matmatmult_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7251       PetscCall(PetscOptionsBool("-matmatmult_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7252       PetscOptionsEnd();
7253     } else {
7254       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
7255       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7256       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7257       PetscOptionsEnd();
7258     }
7259   } else if (ptype == MATPRODUCT_PtAP) {
7260     if (product->api_user) {
7261       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
7262       PetscCall(PetscOptionsBool("-matptap_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7263       PetscOptionsEnd();
7264     } else {
7265       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
7266       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7267       PetscOptionsEnd();
7268     }
7269   }
7270   a = (Mat_MPIAIJ *)A->data;
7271   p = (Mat_MPIAIJ *)P->data;
7272   PetscCall(MatSetSizes(C, m, n, M, N));
7273   PetscCall(PetscLayoutSetUp(C->rmap));
7274   PetscCall(PetscLayoutSetUp(C->cmap));
7275   PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
7276   PetscCall(MatGetOptionsPrefix(C, &prefix));
7277 
7278   cp = 0;
7279   switch (ptype) {
7280   case MATPRODUCT_AB: /* A * P */
7281     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7282 
7283     /* A_diag * P_local (merged or not) */
7284     if (mmdata->abmerge) { /* P's diagonal and off-diag blocks are merged to one matrix, then multiplied by A_diag */
7285       /* P is product->B */
7286       PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7287       PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7288       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7289       PetscCall(MatProductSetFill(mp[cp], product->fill));
7290       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7291       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7292       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7293       mp[cp]->product->api_user = product->api_user;
7294       PetscCall(MatProductSetFromOptions(mp[cp]));
7295       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7296       PetscCall(ISGetIndices(glob, &globidx));
7297       rmapt[cp] = 1;
7298       cmapt[cp] = 2;
7299       cmapa[cp] = globidx;
7300       mptmp[cp] = PETSC_FALSE;
7301       cp++;
7302     } else { /* A_diag * P_diag and A_diag * P_off */
7303       PetscCall(MatProductCreate(a->A, p->A, NULL, &mp[cp]));
7304       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7305       PetscCall(MatProductSetFill(mp[cp], product->fill));
7306       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7307       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7308       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7309       mp[cp]->product->api_user = product->api_user;
7310       PetscCall(MatProductSetFromOptions(mp[cp]));
7311       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7312       rmapt[cp] = 1;
7313       cmapt[cp] = 1;
7314       mptmp[cp] = PETSC_FALSE;
7315       cp++;
7316       PetscCall(MatProductCreate(a->A, p->B, NULL, &mp[cp]));
7317       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7318       PetscCall(MatProductSetFill(mp[cp], product->fill));
7319       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7320       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7321       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7322       mp[cp]->product->api_user = product->api_user;
7323       PetscCall(MatProductSetFromOptions(mp[cp]));
7324       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7325       rmapt[cp] = 1;
7326       cmapt[cp] = 2;
7327       cmapa[cp] = p->garray;
7328       mptmp[cp] = PETSC_FALSE;
7329       cp++;
7330     }
7331 
7332     /* A_off * P_other */
7333     if (mmdata->P_oth) {
7334       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g)); /* make P_oth use local col ids */
7335       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7336       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7337       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7338       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7339       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7340       PetscCall(MatProductSetFill(mp[cp], product->fill));
7341       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7342       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7343       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7344       mp[cp]->product->api_user = product->api_user;
7345       PetscCall(MatProductSetFromOptions(mp[cp]));
7346       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7347       rmapt[cp] = 1;
7348       cmapt[cp] = 2;
7349       cmapa[cp] = P_oth_idx;
7350       mptmp[cp] = PETSC_FALSE;
7351       cp++;
7352     }
7353     break;
7354 
7355   case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
7356     /* A is product->B */
7357     PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7358     if (A == P) { /* when A==P, we can take advantage of the already merged mmdata->Bloc */
7359       PetscCall(MatProductCreate(mmdata->Bloc, mmdata->Bloc, NULL, &mp[cp]));
7360       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7361       PetscCall(MatProductSetFill(mp[cp], product->fill));
7362       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7363       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7364       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7365       mp[cp]->product->api_user = product->api_user;
7366       PetscCall(MatProductSetFromOptions(mp[cp]));
7367       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7368       PetscCall(ISGetIndices(glob, &globidx));
7369       rmapt[cp] = 2;
7370       rmapa[cp] = globidx;
7371       cmapt[cp] = 2;
7372       cmapa[cp] = globidx;
7373       mptmp[cp] = PETSC_FALSE;
7374       cp++;
7375     } else {
7376       PetscCall(MatProductCreate(p->A, mmdata->Bloc, NULL, &mp[cp]));
7377       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7378       PetscCall(MatProductSetFill(mp[cp], product->fill));
7379       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7380       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7381       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7382       mp[cp]->product->api_user = product->api_user;
7383       PetscCall(MatProductSetFromOptions(mp[cp]));
7384       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7385       PetscCall(ISGetIndices(glob, &globidx));
7386       rmapt[cp] = 1;
7387       cmapt[cp] = 2;
7388       cmapa[cp] = globidx;
7389       mptmp[cp] = PETSC_FALSE;
7390       cp++;
7391       PetscCall(MatProductCreate(p->B, mmdata->Bloc, NULL, &mp[cp]));
7392       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7393       PetscCall(MatProductSetFill(mp[cp], product->fill));
7394       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7395       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7396       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7397       mp[cp]->product->api_user = product->api_user;
7398       PetscCall(MatProductSetFromOptions(mp[cp]));
7399       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7400       rmapt[cp] = 2;
7401       rmapa[cp] = p->garray;
7402       cmapt[cp] = 2;
7403       cmapa[cp] = globidx;
7404       mptmp[cp] = PETSC_FALSE;
7405       cp++;
7406     }
7407     break;
7408   case MATPRODUCT_PtAP:
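    /* C = P^t * A * P is assembled from a local PtAP with the merged local rows of P (Bloc below),
       i.e. Bloc^t * A_diag * Bloc, plus the off-process contribution Bloc^t * (A_off * P_oth),
       computed below as a temporary AB product followed by an AtB product */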
7409     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7410     /* P is product->B */
7411     PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7412     PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7413     PetscCall(MatProductSetType(mp[cp], MATPRODUCT_PtAP));
7414     PetscCall(MatProductSetFill(mp[cp], product->fill));
7415     PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7416     PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7417     PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7418     mp[cp]->product->api_user = product->api_user;
7419     PetscCall(MatProductSetFromOptions(mp[cp]));
7420     PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7421     PetscCall(ISGetIndices(glob, &globidx));
7422     rmapt[cp] = 2;
7423     rmapa[cp] = globidx;
7424     cmapt[cp] = 2;
7425     cmapa[cp] = globidx;
7426     mptmp[cp] = PETSC_FALSE;
7427     cp++;
7428     if (mmdata->P_oth) {
7429       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g));
7430       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7431       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7432       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7433       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7434       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7435       PetscCall(MatProductSetFill(mp[cp], product->fill));
7436       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7437       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7438       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7439       mp[cp]->product->api_user = product->api_user;
7440       PetscCall(MatProductSetFromOptions(mp[cp]));
7441       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7442       mptmp[cp] = PETSC_TRUE;
7443       cp++;
7444       PetscCall(MatProductCreate(mmdata->Bloc, mp[1], NULL, &mp[cp]));
7445       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7446       PetscCall(MatProductSetFill(mp[cp], product->fill));
7447       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7448       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7449       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7450       mp[cp]->product->api_user = product->api_user;
7451       PetscCall(MatProductSetFromOptions(mp[cp]));
7452       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7453       rmapt[cp] = 2;
7454       rmapa[cp] = globidx;
7455       cmapt[cp] = 2;
7456       cmapa[cp] = P_oth_idx;
7457       mptmp[cp] = PETSC_FALSE;
7458       cp++;
7459     }
7460     break;
7461   default:
7462     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7463   }
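  /* Each intermediate product mp[i] now carries a row (rmapt) and column (cmapt) map type:
       type 0: indices are already global (columns only)
       type 1: local indices map to consecutive global indices starting at this process's row/column start of C
       type 2: local indices map through the sparse index arrays rmapa[]/cmapa[]
     mptmp[i] marks temporary products that only feed later products and contribute no entries of C directly */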
7464   /* sanity check */
7465   if (size > 1)
7466     for (i = 0; i < cp; i++) PetscCheck(rmapt[i] != 2 || hasoffproc, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected offproc map type for product %" PetscInt_FMT, i);
7467 
7468   PetscCall(PetscMalloc2(cp, &mmdata->mp, cp, &mmdata->mptmp));
7469   for (i = 0; i < cp; i++) {
7470     mmdata->mp[i]    = mp[i];
7471     mmdata->mptmp[i] = mptmp[i];
7472   }
7473   mmdata->cp             = cp;
7474   C->product->data       = mmdata;
7475   C->product->destroy    = MatDestroy_MatMatMPIAIJBACKEND;
7476   C->ops->productnumeric = MatProductNumeric_MPIAIJBACKEND;
7477 
7478   /* memory type */
7479   mmdata->mtype = PETSC_MEMTYPE_HOST;
7480   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iscuda, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, ""));
7481   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iship, MATSEQAIJHIPSPARSE, MATMPIAIJHIPSPARSE, ""));
7482   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iskokk, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, ""));
7483   if (iscuda) mmdata->mtype = PETSC_MEMTYPE_CUDA;
7484   else if (iship) mmdata->mtype = PETSC_MEMTYPE_HIP;
7485   else if (iskokk) mmdata->mtype = PETSC_MEMTYPE_KOKKOS;
7486 
7487   /* prepare COO coordinates for value insertion */
7488 
7489   /* count the total nonzeros of the intermediate seqaij Mats
7490     ncoo_d:    # of nonzeros of matrices that do not have offproc entries
7491     ncoo_o:    # of nonzeros (of matrices that might have offproc entries) that will be inserted on remote procs
7492     ncoo_oown: # of nonzeros (of matrices that might have offproc entries) that will be inserted locally
7493   */
7494   for (cp = 0, ncoo_d = 0, ncoo_o = 0, ncoo_oown = 0; cp < mmdata->cp; cp++) {
7495     Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mp[cp]->data;
7496     if (mptmp[cp]) continue;
7497     if (rmapt[cp] == 2 && hasoffproc) { /* the rows need to be scattered to all processes (might include self) */
7498       const PetscInt *rmap = rmapa[cp];
7499       const PetscInt  mr   = mp[cp]->rmap->n;
7500       const PetscInt  rs   = C->rmap->rstart;
7501       const PetscInt  re   = C->rmap->rend;
7502       const PetscInt *ii   = mm->i;
7503       for (i = 0; i < mr; i++) {
7504         const PetscInt gr = rmap[i];
7505         const PetscInt nz = ii[i + 1] - ii[i];
7506         if (gr < rs || gr >= re) ncoo_o += nz; /* this row is offproc */
7507         else ncoo_oown += nz;                  /* this row is local */
7508       }
7509     } else ncoo_d += mm->nz;
7510   }
7511 
7512   /*
7513     ncoo: total number of nonzeros (including those inserted by remote procs) belonging to this proc
7514 
7515     ncoo = ncoo_d + ncoo_oown + ncoo2, where ncoo2 is the number of nonzeros inserted to me by other procs.
7516 
7517     off[0] points to a big index array, which is shared by off[1,2,...]. Similarly for own[0].
7518 
7519     off[p]: points to the segment for matrix mp[p], storing locations of nonzeros that mp[p] will insert to others
7520     own[p]: points to the segment for matrix mp[p], storing locations of nonzeros that mp[p] will insert locally
7521     so off[p+1]-off[p] is the number of nonzeros that mp[p] will send to others.
7522 
7523     coo_i/j/v[]: [ncoo] row/col/val of nonzeros belonging to this proc.
7524     For example, coo_i[]: the first ncoo_d + ncoo_oown entries store i of locally inserted nonzeros, and the remaining entries store i of nonzeros this proc will receive.
7525   */
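  /* A small illustrative example of the layout above, with hypothetical sizes: suppose ncoo_d = 4,
     ncoo_oown = 2 and ncoo2 = 3; then ncoo = 9, coo_i[0..5]/coo_j[0..5] hold the (i,j) of locally
     inserted nonzeros, and coo_i[6..8]/coo_j[6..8] hold the (i,j) received from other processes */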
7526   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->off)); /* +1 to make a csr-like data structure */
7527   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->own));
7528 
7529   /* gather (i,j) of nonzeros inserted by remote procs */
7530   if (hasoffproc) {
7531     PetscSF  msf;
7532     PetscInt ncoo2, *coo_i2, *coo_j2;
7533 
7534     PetscCall(PetscMalloc1(ncoo_o, &mmdata->off[0]));
7535     PetscCall(PetscMalloc1(ncoo_oown, &mmdata->own[0]));
7536     PetscCall(PetscMalloc2(ncoo_o, &coo_i, ncoo_o, &coo_j)); /* to collect (i,j) of entries to be sent to others */
7537 
7538     for (cp = 0, ncoo_o = 0; cp < mmdata->cp; cp++) {
7539       Mat_SeqAIJ *mm     = (Mat_SeqAIJ *)mp[cp]->data;
7540       PetscInt   *idxoff = mmdata->off[cp];
7541       PetscInt   *idxown = mmdata->own[cp];
7542       if (!mptmp[cp] && rmapt[cp] == 2) { /* row map is sparse */
7543         const PetscInt *rmap = rmapa[cp];
7544         const PetscInt *cmap = cmapa[cp];
7545         const PetscInt *ii   = mm->i;
7546         PetscInt       *coi  = coo_i + ncoo_o;
7547         PetscInt       *coj  = coo_j + ncoo_o;
7548         const PetscInt  mr   = mp[cp]->rmap->n;
7549         const PetscInt  rs   = C->rmap->rstart;
7550         const PetscInt  re   = C->rmap->rend;
7551         const PetscInt  cs   = C->cmap->rstart;
7552         for (i = 0; i < mr; i++) {
7553           const PetscInt *jj = mm->j + ii[i];
7554           const PetscInt  gr = rmap[i];
7555           const PetscInt  nz = ii[i + 1] - ii[i];
7556           if (gr < rs || gr >= re) { /* this is an offproc row */
7557             for (j = ii[i]; j < ii[i + 1]; j++) {
7558               *coi++    = gr;
7559               *idxoff++ = j;
7560             }
7561             if (!cmapt[cp]) { /* already global */
7562               for (j = 0; j < nz; j++) *coj++ = jj[j];
7563             } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7564               for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7565             } else { /* offdiag */
7566               for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7567             }
7568             ncoo_o += nz;
7569           } else { /* this is a local row */
7570             for (j = ii[i]; j < ii[i + 1]; j++) *idxown++ = j;
7571           }
7572         }
7573       }
7574       mmdata->off[cp + 1] = idxoff;
7575       mmdata->own[cp + 1] = idxown;
7576     }
7577 
7578     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7579     PetscInt incoo_o;
7580     PetscCall(PetscIntCast(ncoo_o, &incoo_o));
7581     PetscCall(PetscSFSetGraphLayout(mmdata->sf, C->rmap, incoo_o /*nleaves*/, NULL /*ilocal*/, PETSC_OWN_POINTER, coo_i));
7582     PetscCall(PetscSFGetMultiSF(mmdata->sf, &msf));
7583     PetscCall(PetscSFGetGraph(msf, &ncoo2 /*nroots*/, NULL, NULL, NULL));
7584     ncoo = ncoo_d + ncoo_oown + ncoo2;
7585     PetscCall(PetscMalloc2(ncoo, &coo_i2, ncoo, &coo_j2));
7586     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown)); /* put (i,j) of remote nonzeros at back */
7587     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown));
7588     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7589     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7590     PetscCall(PetscFree2(coo_i, coo_j));
7591     /* allocate MPI send buffer to collect nonzero values to be sent to remote procs */
7592     PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo_o * sizeof(PetscScalar), (void **)&mmdata->coo_w));
7593     coo_i = coo_i2;
7594     coo_j = coo_j2;
7595   } else { /* no offproc value insertion */
7596     ncoo = ncoo_d;
7597     PetscCall(PetscMalloc2(ncoo, &coo_i, ncoo, &coo_j));
7598 
7599     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7600     PetscCall(PetscSFSetGraph(mmdata->sf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
7601     PetscCall(PetscSFSetUp(mmdata->sf));
7602   }
7603   mmdata->hasoffproc = hasoffproc;
7604 
7605   /* gather (i,j) of nonzeros inserted locally */
7606   for (cp = 0, ncoo_d = 0; cp < mmdata->cp; cp++) {
7607     Mat_SeqAIJ     *mm   = (Mat_SeqAIJ *)mp[cp]->data;
7608     PetscInt       *coi  = coo_i + ncoo_d;
7609     PetscInt       *coj  = coo_j + ncoo_d;
7610     const PetscInt *jj   = mm->j;
7611     const PetscInt *ii   = mm->i;
7612     const PetscInt *cmap = cmapa[cp];
7613     const PetscInt *rmap = rmapa[cp];
7614     const PetscInt  mr   = mp[cp]->rmap->n;
7615     const PetscInt  rs   = C->rmap->rstart;
7616     const PetscInt  re   = C->rmap->rend;
7617     const PetscInt  cs   = C->cmap->rstart;
7618 
7619     if (mptmp[cp]) continue;
7620     if (rmapt[cp] == 1) { /* consecutive rows */
7621       /* fill coo_i */
7622       for (i = 0; i < mr; i++) {
7623         const PetscInt gr = i + rs;
7624         for (j = ii[i]; j < ii[i + 1]; j++) coi[j] = gr;
7625       }
7626       /* fill coo_j */
7627       if (!cmapt[cp]) { /* type-0, already global */
7628         PetscCall(PetscArraycpy(coj, jj, mm->nz));
7629       } else if (cmapt[cp] == 1) {                        /* type-1, local to global for consecutive columns of C */
7630         for (j = 0; j < mm->nz; j++) coj[j] = jj[j] + cs; /* lid + col start */
7631       } else {                                            /* type-2, local to global for sparse columns */
7632         for (j = 0; j < mm->nz; j++) coj[j] = cmap[jj[j]];
7633       }
7634       ncoo_d += mm->nz;
7635     } else if (rmapt[cp] == 2) { /* sparse rows */
7636       for (i = 0; i < mr; i++) {
7637         const PetscInt *jj = mm->j + ii[i];
7638         const PetscInt  gr = rmap[i];
7639         const PetscInt  nz = ii[i + 1] - ii[i];
7640         if (gr >= rs && gr < re) { /* local rows */
7641           for (j = ii[i]; j < ii[i + 1]; j++) *coi++ = gr;
7642           if (!cmapt[cp]) { /* type-0, already global */
7643             for (j = 0; j < nz; j++) *coj++ = jj[j];
7644           } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7645             for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7646           } else { /* type-2, local to global for sparse columns */
7647             for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7648           }
7649           ncoo_d += nz;
7650         }
7651       }
7652     }
7653   }
7654   if (glob) PetscCall(ISRestoreIndices(glob, &globidx));
7655   PetscCall(ISDestroy(&glob));
7656   if (P_oth_l2g) PetscCall(ISLocalToGlobalMappingRestoreIndices(P_oth_l2g, &P_oth_idx));
7657   PetscCall(ISLocalToGlobalMappingDestroy(&P_oth_l2g));
7658   /* allocate an array to store all nonzeros (inserted locally or remotely) belonging to this proc */
7659   PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo * sizeof(PetscScalar), (void **)&mmdata->coo_v));
7660 
7661   /* preallocate with COO data */
7662   PetscCall(MatSetPreallocationCOO(C, ncoo, coo_i, coo_j));
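  /* the numeric values are supplied later by the numeric phase (MatProductNumeric_MPIAIJBACKEND),
     which recomputes the intermediate products, fills mmdata->coo_v (moving off-process values
     through mmdata->sf and mmdata->coo_w), and passes it to MatSetValuesCOO() */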
7663   PetscCall(PetscFree2(coo_i, coo_j));
7664   PetscFunctionReturn(PETSC_SUCCESS);
7665 }
7666 
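/* The device backend can be turned off from the options database; a usage sketch (the option
   names are the ones handled below):
     -matmatmult_backend_cpu              use the CPU path for MatMatMult()
     -mattransposematmult_backend_cpu     use the CPU path for MatTransposeMatMult()
     -matptap_backend_cpu                 use the CPU path for MatPtAP()
     -mat_product_algorithm_backend_cpu   same, when the product is set up through the MatProduct API */
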
7667 PetscErrorCode MatProductSetFromOptions_MPIAIJBACKEND(Mat mat)
7668 {
7669   Mat_Product *product = mat->product;
7670 #if defined(PETSC_HAVE_DEVICE)
7671   PetscBool match  = PETSC_FALSE;
7672   PetscBool usecpu = PETSC_FALSE;
7673 #else
7674   PetscBool match = PETSC_TRUE;
7675 #endif
7676 
7677   PetscFunctionBegin;
7678   MatCheckProduct(mat, 1);
7679 #if defined(PETSC_HAVE_DEVICE)
7680   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match));
7681   if (match) { /* we can always fall back to the CPU if requested */
7682     switch (product->type) {
7683     case MATPRODUCT_AB:
7684       if (product->api_user) {
7685         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
7686         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7687         PetscOptionsEnd();
7688       } else {
7689         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
7690         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7691         PetscOptionsEnd();
7692       }
7693       break;
7694     case MATPRODUCT_AtB:
7695       if (product->api_user) {
7696         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
7697         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7698         PetscOptionsEnd();
7699       } else {
7700         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
7701         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7702         PetscOptionsEnd();
7703       }
7704       break;
7705     case MATPRODUCT_PtAP:
7706       if (product->api_user) {
7707         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
7708         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7709         PetscOptionsEnd();
7710       } else {
7711         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
7712         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7713         PetscOptionsEnd();
7714       }
7715       break;
7716     default:
7717       break;
7718     }
7719     match = (PetscBool)!usecpu;
7720   }
7721 #endif
7722   if (match) {
7723     switch (product->type) {
7724     case MATPRODUCT_AB:
7725     case MATPRODUCT_AtB:
7726     case MATPRODUCT_PtAP:
7727       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJBACKEND;
7728       break;
7729     default:
7730       break;
7731     }
7732   }
7733   /* fallback to MPIAIJ ops */
7734   if (!mat->ops->productsymbolic) PetscCall(MatProductSetFromOptions_MPIAIJ(mat));
7735   PetscFunctionReturn(PETSC_SUCCESS);
7736 }
7737 
7738 /*
7739    Produces a set of block column indices of the matrix row, one for each block represented in the original row
7740 
7741    n - the number of block indices in cc[]
7742    cc - the block indices (must be large enough to contain the indices)
7743 */
7744 static inline PetscErrorCode MatCollapseRow(Mat Amat, PetscInt row, PetscInt bs, PetscInt *n, PetscInt *cc)
7745 {
7746   PetscInt        cnt = -1, nidx, j;
7747   const PetscInt *idx;
7748 
7749   PetscFunctionBegin;
7750   PetscCall(MatGetRow(Amat, row, &nidx, &idx, NULL));
7751   if (nidx) {
7752     cnt     = 0;
7753     cc[cnt] = idx[0] / bs;
7754     for (j = 1; j < nidx; j++) {
7755       if (cc[cnt] < idx[j] / bs) cc[++cnt] = idx[j] / bs;
7756     }
7757   }
7758   PetscCall(MatRestoreRow(Amat, row, &nidx, &idx, NULL));
7759   *n = cnt + 1;
7760   PetscFunctionReturn(PETSC_SUCCESS);
7761 }
7762 
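/* For example, with bs = 2 and a row whose (sorted) global columns are {0, 1, 4, 5, 7},
   MatCollapseRow() returns n = 3 and cc = {0, 2, 3} */
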
7763 /*
7764     Produces a set of block column indices of the matrix block row, one for each block represented in the original set of rows
7765 
7766     ncollapsed - the number of block indices
7767     collapsed - the block indices (must be large enough to contain the indices)
7768 */
7769 static inline PetscErrorCode MatCollapseRows(Mat Amat, PetscInt start, PetscInt bs, PetscInt *w0, PetscInt *w1, PetscInt *w2, PetscInt *ncollapsed, PetscInt **collapsed)
7770 {
7771   PetscInt i, nprev, *cprev = w0, ncur = 0, *ccur = w1, *merged = w2, *cprevtmp;
7772 
7773   PetscFunctionBegin;
7774   PetscCall(MatCollapseRow(Amat, start, bs, &nprev, cprev));
7775   for (i = start + 1; i < start + bs; i++) {
7776     PetscCall(MatCollapseRow(Amat, i, bs, &ncur, ccur));
7777     PetscCall(PetscMergeIntArray(nprev, cprev, ncur, ccur, &nprev, &merged));
7778     cprevtmp = cprev;
7779     cprev    = merged;
7780     merged   = cprevtmp;
7781   }
7782   *ncollapsed = nprev;
7783   if (collapsed) *collapsed = cprev;
7784   PetscFunctionReturn(PETSC_SUCCESS);
7785 }
7786 
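/* For example, with bs = 2, if row start collapses to block columns {0, 2} and row start+1
   collapses to {2, 3}, the merge produces ncollapsed = 3 and collapsed = {0, 2, 3} */
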
7787 /*
7788  MatCreateGraph_Simple_AIJ - create simple scalar matrix (graph) from potentially blocked matrix
7789 
7790  Input Parameters:
7791 + Amat - matrix
7792 . symmetrize - make the result symmetric
7793 . scale - symmetrically scale with the diagonal
. filter - if nonnegative, drop graph entries with magnitude below this value
. index_size - number of entries in index[] (0 means use all rows/columns of each block)
- index - rows/columns within each block to use when computing the block values
7794 
7795  Output Parameter:
7796 . a_Gmat - the output scalar graph (values >= 0)
7797 
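 Example usage (a minimal sketch; Amat is assumed to be an assembled (MPI)AIJ matrix):
.vb
   Mat G;
   PetscCall(MatCreateGraph_Simple_AIJ(Amat, PETSC_TRUE, PETSC_TRUE, 0.01, 0, NULL, &G));
   ... use the scalar graph G (for example, as input to an aggregation-based coarsener) ...
   PetscCall(MatDestroy(&G));
.ve
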
7798 */
7799 PETSC_INTERN PetscErrorCode MatCreateGraph_Simple_AIJ(Mat Amat, PetscBool symmetrize, PetscBool scale, PetscReal filter, PetscInt index_size, PetscInt index[], Mat *a_Gmat)
7800 {
7801   PetscInt  Istart, Iend, Ii, jj, kk, ncols, nloc, NN, MM, bs;
7802   MPI_Comm  comm;
7803   Mat       Gmat;
7804   PetscBool ismpiaij, isseqaij;
7805   Mat       a, b, c;
7806   MatType   jtype;
7807 
7808   PetscFunctionBegin;
7809   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
7810   PetscCall(MatGetOwnershipRange(Amat, &Istart, &Iend));
7811   PetscCall(MatGetSize(Amat, &MM, &NN));
7812   PetscCall(MatGetBlockSize(Amat, &bs));
7813   nloc = (Iend - Istart) / bs;
7814 
7815   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATSEQAIJ, &isseqaij));
7816   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATMPIAIJ, &ismpiaij));
7817   PetscCheck(isseqaij || ismpiaij, comm, PETSC_ERR_USER, "Require (MPI)AIJ matrix type");
7818 
7819   /* TODO GPU: these calls are potentially expensive if the matrices are large and we want to use the GPU */
7820   /* One solution is to provide a new API, MatAIJGetCollapsedAIJ, for which each class can provide a fast
7821      implementation */
7822   if (bs > 1) {
7823     PetscCall(MatGetType(Amat, &jtype));
7824     PetscCall(MatCreate(comm, &Gmat));
7825     PetscCall(MatSetType(Gmat, jtype));
7826     PetscCall(MatSetSizes(Gmat, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE));
7827     PetscCall(MatSetBlockSizes(Gmat, 1, 1));
7828     if (isseqaij || ((Mat_MPIAIJ *)Amat->data)->garray) {
7829       PetscInt  *d_nnz, *o_nnz;
7830       MatScalar *aa, val, *AA;
7831       PetscInt  *aj, *ai, *AJ, nc, nmax = 0;
7832 
7833       if (isseqaij) {
7834         a = Amat;
7835         b = NULL;
7836       } else {
7837         Mat_MPIAIJ *d = (Mat_MPIAIJ *)Amat->data;
7838         a             = d->A;
7839         b             = d->B;
7840       }
7841       PetscCall(PetscInfo(Amat, "New bs>1 Graph. nloc=%" PetscInt_FMT "\n", nloc));
7842       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7843       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
7844         PetscInt       *nnz = (c == a) ? d_nnz : o_nnz;
7845         const PetscInt *cols1, *cols2;
7846 
7847         for (PetscInt brow = 0, nc1, nc2, ok = 1; brow < nloc * bs; brow += bs) { // block rows
7848           PetscCall(MatGetRow(c, brow, &nc2, &cols2, NULL));
7849           nnz[brow / bs] = nc2 / bs;
7850           if (nc2 % bs) ok = 0;
7851           if (nnz[brow / bs] > nmax) nmax = nnz[brow / bs];
7852           for (PetscInt ii = 1; ii < bs; ii++) { // check for non-dense blocks
7853             PetscCall(MatGetRow(c, brow + ii, &nc1, &cols1, NULL));
7854             if (nc1 != nc2) ok = 0;
7855             else {
7856               for (PetscInt jj = 0; jj < nc1 && ok == 1; jj++) {
7857                 if (cols1[jj] != cols2[jj]) ok = 0;
7858                 if (cols1[jj] % bs != jj % bs) ok = 0;
7859               }
7860             }
7861             PetscCall(MatRestoreRow(c, brow + ii, &nc1, &cols1, NULL));
7862           }
7863           PetscCall(MatRestoreRow(c, brow, &nc2, &cols2, NULL));
7864           if (!ok) {
7865             PetscCall(PetscFree2(d_nnz, o_nnz));
7866             PetscCall(PetscInfo(Amat, "Found sparse blocks - revert to slow method\n"));
7867             goto old_bs;
7868           }
7869         }
7870       }
7871       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7872       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7873       PetscCall(PetscFree2(d_nnz, o_nnz));
7874       PetscCall(PetscMalloc2(nmax, &AA, nmax, &AJ));
7875       // diag
7876       for (PetscInt brow = 0, n, grow; brow < nloc * bs; brow += bs) { // block rows
7877         Mat_SeqAIJ *aseq = (Mat_SeqAIJ *)a->data;
7878 
7879         ai = aseq->i;
7880         n  = ai[brow + 1] - ai[brow];
7881         aj = aseq->j + ai[brow];
7882         for (PetscInt k = 0; k < n; k += bs) {   // block columns
7883           AJ[k / bs] = aj[k] / bs + Istart / bs; // diag starts at (Istart,Istart)
7884           val        = 0;
7885           if (index_size == 0) {
7886             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7887               aa = aseq->a + ai[brow + ii] + k;
7888               for (PetscInt jj = 0; jj < bs; jj++) {    // columns in block
7889                 val += PetscAbs(PetscRealPart(aa[jj])); // a sort of norm
7890               }
7891             }
7892           } else {                                            // use (index,index) value if provided
7893             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7894               PetscInt ii = index[iii];
7895               aa          = aseq->a + ai[brow + ii] + k;
7896               for (PetscInt jjj = 0; jjj < index_size; jjj++) { // columns in block
7897                 PetscInt jj = index[jjj];
7898                 val += PetscAbs(PetscRealPart(aa[jj]));
7899               }
7900             }
7901           }
7902           PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7903           AA[k / bs] = val;
7904         }
7905         grow = Istart / bs + brow / bs;
7906         PetscCall(MatSetValues(Gmat, 1, &grow, n / bs, AJ, AA, ADD_VALUES));
7907       }
7908       // off-diag
7909       if (ismpiaij) {
7910         Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)Amat->data;
7911         const PetscScalar *vals;
7912         const PetscInt    *cols, *garray = aij->garray;
7913 
7914         PetscCheck(garray, PETSC_COMM_SELF, PETSC_ERR_USER, "Missing garray");
7915         for (PetscInt brow = 0, grow; brow < nloc * bs; brow += bs) { // block rows
7916           PetscCall(MatGetRow(b, brow, &ncols, &cols, NULL));
7917           for (PetscInt k = 0, cidx = 0; k < ncols; k += bs, cidx++) {
7918             PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs >= nmax");
7919             AA[k / bs] = 0;
7920             AJ[cidx]   = garray[cols[k]] / bs;
7921           }
7922           nc = ncols / bs;
7923           PetscCall(MatRestoreRow(b, brow, &ncols, &cols, NULL));
7924           if (index_size == 0) {
7925             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7926               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7927               for (PetscInt k = 0; k < ncols; k += bs) {
7928                 for (PetscInt jj = 0; jj < bs; jj++) { // cols in block
7929                   PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7930                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7931                 }
7932               }
7933               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7934             }
7935           } else {                                            // use (index,index) value if provided
7936             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7937               PetscInt ii = index[iii];
7938               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7939               for (PetscInt k = 0; k < ncols; k += bs) {
7940                 for (PetscInt jjj = 0; jjj < index_size; jjj++) { // cols in block
7941                   PetscInt jj = index[jjj];
7942                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7943                 }
7944               }
7945               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7946             }
7947           }
7948           grow = Istart / bs + brow / bs;
7949           PetscCall(MatSetValues(Gmat, 1, &grow, nc, AJ, AA, ADD_VALUES));
7950         }
7951       }
7952       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7953       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7954       PetscCall(PetscFree2(AA, AJ));
7955     } else {
7956       const PetscScalar *vals;
7957       const PetscInt    *idx;
7958       PetscInt          *d_nnz, *o_nnz, *w0, *w1, *w2;
7959     old_bs:
7960       /*
7961        Determine the preallocation needed for the scalar matrix derived from the vector matrix.
7962        */
7963       PetscCall(PetscInfo(Amat, "OLD bs>1 CreateGraph\n"));
7964       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7965       if (isseqaij) {
7966         PetscInt max_d_nnz;
7967 
7968         /*
7969          Determine exact preallocation count for (sequential) scalar matrix
7970          */
7971         PetscCall(MatSeqAIJGetMaxRowNonzeros(Amat, &max_d_nnz));
7972         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7973         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7974         for (Ii = 0, jj = 0; Ii < Iend; Ii += bs, jj++) PetscCall(MatCollapseRows(Amat, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7975         PetscCall(PetscFree3(w0, w1, w2));
7976       } else if (ismpiaij) {
7977         Mat             Daij, Oaij;
7978         const PetscInt *garray;
7979         PetscInt        max_d_nnz;
7980 
7981         PetscCall(MatMPIAIJGetSeqAIJ(Amat, &Daij, &Oaij, &garray));
7982         /*
7983          Determine exact preallocation count for diagonal block portion of scalar matrix
7984          */
7985         PetscCall(MatSeqAIJGetMaxRowNonzeros(Daij, &max_d_nnz));
7986         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7987         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7988         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) PetscCall(MatCollapseRows(Daij, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7989         PetscCall(PetscFree3(w0, w1, w2));
7990         /*
7991          Overestimate (usually grossly so) the preallocation count for the off-diagonal portion of the scalar matrix
7992          */
7993         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) {
7994           o_nnz[jj] = 0;
7995           for (kk = 0; kk < bs; kk++) { /* rows that get collapsed to a single row */
7996             PetscCall(MatGetRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7997             o_nnz[jj] += ncols;
7998             PetscCall(MatRestoreRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7999           }
8000           if (o_nnz[jj] > (NN / bs - nloc)) o_nnz[jj] = NN / bs - nloc;
8001         }
8002       } else SETERRQ(comm, PETSC_ERR_USER, "Require AIJ matrix type");
8003       /* get scalar copy (norms) of matrix */
8004       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
8005       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
8006       PetscCall(PetscFree2(d_nnz, o_nnz));
8007       for (Ii = Istart; Ii < Iend; Ii++) {
8008         PetscInt dest_row = Ii / bs;
8009 
8010         PetscCall(MatGetRow(Amat, Ii, &ncols, &idx, &vals));
8011         for (jj = 0; jj < ncols; jj++) {
8012           PetscInt    dest_col = idx[jj] / bs;
8013           PetscScalar sv       = PetscAbs(PetscRealPart(vals[jj]));
8014 
8015           PetscCall(MatSetValues(Gmat, 1, &dest_row, 1, &dest_col, &sv, ADD_VALUES));
8016         }
8017         PetscCall(MatRestoreRow(Amat, Ii, &ncols, &idx, &vals));
8018       }
8019       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
8020       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
8021     }
8022   } else {
8023     if (symmetrize || filter >= 0 || scale) PetscCall(MatDuplicate(Amat, MAT_COPY_VALUES, &Gmat));
8024     else {
8025       Gmat = Amat;
8026       PetscCall(PetscObjectReference((PetscObject)Gmat));
8027     }
8028     if (isseqaij) {
8029       a = Gmat;
8030       b = NULL;
8031     } else {
8032       Mat_MPIAIJ *d = (Mat_MPIAIJ *)Gmat->data;
8033       a             = d->A;
8034       b             = d->B;
8035     }
8036     if (filter >= 0 || scale) {
8037       /* take absolute value of each entry */
8038       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
8039         MatInfo      info;
8040         PetscScalar *avals;
8041 
8042         PetscCall(MatGetInfo(c, MAT_LOCAL, &info));
8043         PetscCall(MatSeqAIJGetArray(c, &avals));
8044         for (int jj = 0; jj < info.nz_used; jj++) avals[jj] = PetscAbsScalar(avals[jj]);
8045         PetscCall(MatSeqAIJRestoreArray(c, &avals));
8046       }
8047     }
8048   }
8049   if (symmetrize) {
8050     PetscBool isset, issym;
8051 
8052     PetscCall(MatIsSymmetricKnown(Amat, &isset, &issym));
8053     if (!isset || !issym) {
8054       Mat matTrans;
8055 
8056       PetscCall(MatTranspose(Gmat, MAT_INITIAL_MATRIX, &matTrans));
8057       PetscCall(MatAXPY(Gmat, 1.0, matTrans, Gmat->structurally_symmetric == PETSC_BOOL3_TRUE ? SAME_NONZERO_PATTERN : DIFFERENT_NONZERO_PATTERN));
8058       PetscCall(MatDestroy(&matTrans));
8059     }
8060     PetscCall(MatSetOption(Gmat, MAT_SYMMETRIC, PETSC_TRUE));
8061   } else if (Amat != Gmat) PetscCall(MatPropagateSymmetryOptions(Amat, Gmat));
8062   if (scale) {
8063     /* symmetrically scale Gmat so that all its diagonal values become 1 or -1 */
8064     Vec diag;
8065 
8066     PetscCall(MatCreateVecs(Gmat, &diag, NULL));
8067     PetscCall(MatGetDiagonal(Gmat, diag));
8068     PetscCall(VecReciprocal(diag));
8069     PetscCall(VecSqrtAbs(diag));
8070     PetscCall(MatDiagonalScale(Gmat, diag, diag));
8071     PetscCall(VecDestroy(&diag));
8072   }
8073   PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_graph_view"));
8074   if (filter >= 0) {
8075     PetscCall(MatFilter(Gmat, filter, PETSC_TRUE, PETSC_TRUE));
8076     PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_filter_graph_view"));
8077   }
8078   *a_Gmat = Gmat;
8079   PetscFunctionReturn(PETSC_SUCCESS);
8080 }
8081 
8082 /*
8083     Special version for direct calls from Fortran
8084 */
8085 
8086 /* Change these macros so they can be used in a void function */
8087 /* Identical to PetscCallVoid, except it assigns to *_ierr */
8088 #undef PetscCall
8089 #define PetscCall(...) \
8090   do { \
8091     PetscErrorCode ierr_msv_mpiaij = __VA_ARGS__; \
8092     if (PetscUnlikely(ierr_msv_mpiaij)) { \
8093       *_ierr = PetscError(PETSC_COMM_SELF, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr_msv_mpiaij, PETSC_ERROR_REPEAT, " "); \
8094       return; \
8095     } \
8096   } while (0)
8097 
8098 #undef SETERRQ
8099 #define SETERRQ(comm, ierr, ...) \
8100   do { \
8101     *_ierr = PetscError(comm, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr, PETSC_ERROR_INITIAL, __VA_ARGS__); \
8102     return; \
8103   } while (0)
8104 
8105 #if defined(PETSC_HAVE_FORTRAN_CAPS)
8106   #define matsetvaluesmpiaij_ MATSETVALUESMPIAIJ
8107 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
8108   #define matsetvaluesmpiaij_ matsetvaluesmpiaij
8109 #else
8110 #endif
8111 PETSC_EXTERN void matsetvaluesmpiaij_(Mat *mmat, PetscInt *mm, const PetscInt im[], PetscInt *mn, const PetscInt in[], const PetscScalar v[], InsertMode *maddv, PetscErrorCode *_ierr)
8112 {
8113   Mat         mat = *mmat;
8114   PetscInt    m = *mm, n = *mn;
8115   InsertMode  addv = *maddv;
8116   Mat_MPIAIJ *aij  = (Mat_MPIAIJ *)mat->data;
8117   PetscScalar value;
8118 
8119   MatCheckPreallocated(mat, 1);
8120   if (mat->insertmode == NOT_SET_VALUES) mat->insertmode = addv;
8121   else PetscCheck(mat->insertmode == addv, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Cannot mix add values and insert values");
8122   {
8123     PetscInt  i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
8124     PetscInt  cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
8125     PetscBool roworiented = aij->roworiented;
8126 
8127     /* Some Variables required in the macro */
8128     Mat         A     = aij->A;
8129     Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
8130     PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
8131     MatScalar  *aa;
8132     PetscBool   ignorezeroentries = ((a->ignorezeroentries && (addv == ADD_VALUES)) ? PETSC_TRUE : PETSC_FALSE);
8133     Mat         B                 = aij->B;
8134     Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
8135     PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
8136     MatScalar  *ba;
8137     /* This variable below is only for the PETSC_HAVE_VIENNACL or PETSC_HAVE_CUDA cases, but we define it in all cases because we
8138      * cannot use "#if defined" inside a macro. */
8139     PETSC_UNUSED PetscBool inserted = PETSC_FALSE;
8140 
8141     PetscInt  *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
8142     PetscInt   nonew = a->nonew;
8143     MatScalar *ap1, *ap2;
8144 
8145     PetscFunctionBegin;
8146     PetscCall(MatSeqAIJGetArray(A, &aa));
8147     PetscCall(MatSeqAIJGetArray(B, &ba));
8148     for (i = 0; i < m; i++) {
8149       if (im[i] < 0) continue;
8150       PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
8151       if (im[i] >= rstart && im[i] < rend) {
8152         row      = im[i] - rstart;
8153         lastcol1 = -1;
8154         rp1      = aj + ai[row];
8155         ap1      = aa + ai[row];
8156         rmax1    = aimax[row];
8157         nrow1    = ailen[row];
8158         low1     = 0;
8159         high1    = nrow1;
8160         lastcol2 = -1;
8161         rp2      = bj + bi[row];
8162         ap2      = ba + bi[row];
8163         rmax2    = bimax[row];
8164         nrow2    = bilen[row];
8165         low2     = 0;
8166         high2    = nrow2;
8167 
8168         for (j = 0; j < n; j++) {
8169           if (roworiented) value = v[i * n + j];
8170           else value = v[i + j * m];
8171           if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
8172           if (in[j] >= cstart && in[j] < cend) {
8173             col = in[j] - cstart;
8174             MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
8175           } else if (in[j] < 0) continue;
8176           else if (PetscUnlikelyDebug(in[j] >= mat->cmap->N)) {
8177             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
8178           } else {
8179             if (mat->was_assembled) {
8180               if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
8181 #if defined(PETSC_USE_CTABLE)
8182               PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col));
8183               col--;
8184 #else
8185               col = aij->colmap[in[j]] - 1;
8186 #endif
8187               if (col < 0 && !((Mat_SeqAIJ *)aij->A->data)->nonew) {
8188                 PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
8189                 col = in[j];
8190                 /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
8191                 B        = aij->B;
8192                 b        = (Mat_SeqAIJ *)B->data;
8193                 bimax    = b->imax;
8194                 bi       = b->i;
8195                 bilen    = b->ilen;
8196                 bj       = b->j;
8197                 rp2      = bj + bi[row];
8198                 ap2      = ba + bi[row];
8199                 rmax2    = bimax[row];
8200                 nrow2    = bilen[row];
8201                 low2     = 0;
8202                 high2    = nrow2;
8203                 bm       = aij->B->rmap->n;
8204                 ba       = b->a;
8205                 inserted = PETSC_FALSE;
8206               }
8207             } else col = in[j];
8208             MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
8209           }
8210         }
8211       } else if (!aij->donotstash) {
8212         if (roworiented) {
8213           PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, v + i * n, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8214         } else {
8215           PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, v + i, m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8216         }
8217       }
8218     }
8219     PetscCall(MatSeqAIJRestoreArray(A, &aa));
8220     PetscCall(MatSeqAIJRestoreArray(B, &ba));
8221   }
8222   PetscFunctionReturnVoid();
8223 }
8224 
8225 /* Undefining these here since they were redefined from their original definition above! No
8226  * other PETSc functions should be defined past this point, as it is impossible to recover the
8227  * original definitions */
8228 #undef PetscCall
8229 #undef SETERRQ
8230