xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 3f7878b3ed00daeee6ae8d2839a0d41a4d3e79ca)
1 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
2 #include <petsc/private/vecimpl.h>
3 #include <petsc/private/sfimpl.h>
4 #include <petsc/private/isimpl.h>
5 #include <petscblaslapack.h>
6 #include <petscsf.h>
7 #include <petsc/private/hashmapi.h>
8 
9 /* defines MatSetValues_MPI_Hash(), MatAssemblyBegin_MPI_Hash(), and MatAssemblyEnd_MPI_Hash() */
10 #define TYPE AIJ
11 #define TYPE_AIJ
12 #include "../src/mat/impls/aij/mpi/mpihashmat.h"
13 #undef TYPE
14 #undef TYPE_AIJ
15 
16 static PetscErrorCode MatReset_MPIAIJ(Mat mat)
17 {
18   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
19 
20   PetscFunctionBegin;
21   PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
22   PetscCall(MatStashDestroy_Private(&mat->stash));
23   PetscCall(VecDestroy(&aij->diag));
24   PetscCall(MatDestroy(&aij->A));
25   PetscCall(MatDestroy(&aij->B));
26 #if defined(PETSC_USE_CTABLE)
27   PetscCall(PetscHMapIDestroy(&aij->colmap));
28 #else
29   PetscCall(PetscFree(aij->colmap));
30 #endif
31   PetscCall(PetscFree(aij->garray));
32   PetscCall(VecDestroy(&aij->lvec));
33   PetscCall(VecScatterDestroy(&aij->Mvctx));
34   PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
35   PetscCall(PetscFree(aij->ld));
36   PetscFunctionReturn(PETSC_SUCCESS);
37 }
38 
39 static PetscErrorCode MatResetHash_MPIAIJ(Mat mat)
40 {
41   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
42   /* Save the nonzero states of the component matrices because those are what are used to determine
43     the nonzero state of mat */
44   PetscObjectState Astate = aij->A->nonzerostate, Bstate = aij->B->nonzerostate;
45 
46   PetscFunctionBegin;
47   PetscCall(MatReset_MPIAIJ(mat));
48   PetscCall(MatSetUp_MPI_Hash(mat));
49   aij->A->nonzerostate = ++Astate, aij->B->nonzerostate = ++Bstate;
50   PetscFunctionReturn(PETSC_SUCCESS);
51 }
52 
53 PetscErrorCode MatDestroy_MPIAIJ(Mat mat)
54 {
55   PetscFunctionBegin;
56   PetscCall(MatReset_MPIAIJ(mat));
57 
58   PetscCall(PetscFree(mat->data));
59 
60   /* may be created by MatCreateMPIAIJSumSeqAIJSymbolic */
61   PetscCall(PetscObjectCompose((PetscObject)mat, "MatMergeSeqsToMPI", NULL));
62 
63   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));
64   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatStoreValues_C", NULL));
65   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatRetrieveValues_C", NULL));
66   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatIsTranspose_C", NULL));
67   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocation_C", NULL));
68   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetPreallocation_C", NULL));
69   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetHash_C", NULL));
70   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocationCSR_C", NULL));
71   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDiagonalScaleLocal_C", NULL));
72   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpibaij_C", NULL));
73   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisbaij_C", NULL));
74 #if defined(PETSC_HAVE_CUDA)
75   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcusparse_C", NULL));
76 #endif
77 #if defined(PETSC_HAVE_HIP)
78   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijhipsparse_C", NULL));
79 #endif
80 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
81   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijkokkos_C", NULL));
82 #endif
83   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", NULL));
84 #if defined(PETSC_HAVE_ELEMENTAL)
85   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_elemental_C", NULL));
86 #endif
87 #if defined(PETSC_HAVE_SCALAPACK)
88   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_scalapack_C", NULL));
89 #endif
90 #if defined(PETSC_HAVE_HYPRE)
91   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_hypre_C", NULL));
92   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", NULL));
93 #endif
94   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
95   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_is_mpiaij_C", NULL));
96   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpiaij_C", NULL));
97   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetUseScalableIncreaseOverlap_C", NULL));
98   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijperm_C", NULL));
99   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijsell_C", NULL));
100 #if defined(PETSC_HAVE_MKL_SPARSE)
101   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijmkl_C", NULL));
102 #endif
103   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcrl_C", NULL));
104   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
105   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisell_C", NULL));
106   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetPreallocationCOO_C", NULL));
107   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetValuesCOO_C", NULL));
108   PetscFunctionReturn(PETSC_SUCCESS);
109 }
110 
111 static PetscErrorCode MatGetRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
112 {
113   Mat B;
114 
115   PetscFunctionBegin;
116   PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &B));
117   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject)B));
118   PetscCall(MatGetRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
119   PetscCall(MatDestroy(&B));
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 static PetscErrorCode MatRestoreRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
124 {
125   Mat B;
126 
127   PetscFunctionBegin;
128   PetscCall(PetscObjectQuery((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject *)&B));
129   PetscCall(MatRestoreRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
130   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", NULL));
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 /*MC
135    MATAIJ - MATAIJ = "aij" - A matrix type to be used for sparse matrices.
136 
137    This matrix type is identical to `MATSEQAIJ` when constructed with a single process communicator,
138    and `MATMPIAIJ` otherwise.  As a result, for single process communicators,
139   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
140   for communicators controlling multiple processes.  It is recommended that you call both of
141   the above preallocation routines for simplicity.
142 
143    Options Database Key:
144 . -mat_type aij - sets the matrix type to `MATAIJ` during a call to `MatSetFromOptions()`
145 
146   Level: beginner
147 
148   Developer Note:
149   Subclasses include `MATAIJCUSPARSE`, `MATAIJPERM`, `MATAIJSELL`, `MATAIJMKL`, `MATAIJCRL`, and `MATAIJKOKKOS`; the type also
150   automatically switches over to use inodes when enough of them exist.
151 
152 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateAIJ()`, `MatCreateSeqAIJ()`
153 M*/
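
/*
   A minimal usage sketch, added for illustration (not part of the PETSc library code), of the creation pattern
   recommended in the MATAIJ manual page above: set the type, then call both preallocation routines so the same
   code works on any communicator size. The local sizes and the preallocation counts (5 and 2) are assumptions.

     Mat      A;
     PetscInt m = 10, n = 10;                                      // illustrative local sizes

     PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
     PetscCall(MatSetSizes(A, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
     PetscCall(MatSetType(A, MATAIJ));
     PetscCall(MatSetFromOptions(A));
     PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL));             // used when the communicator has one process
     PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL));    // used when it has more than one
     // ... MatSetValues(), MatAssemblyBegin(), MatAssemblyEnd(), then use A ...
     PetscCall(MatDestroy(&A));
*/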
154 
155 /*MC
156    MATAIJCRL - MATAIJCRL = "aijcrl" - A matrix type to be used for sparse matrices.
157 
158    This matrix type is identical to `MATSEQAIJCRL` when constructed with a single process communicator,
159    and `MATMPIAIJCRL` otherwise.  As a result, for single process communicators,
160    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
161   for communicators controlling multiple processes.  It is recommended that you call both of
162   the above preallocation routines for simplicity.
163 
164    Options Database Key:
165 . -mat_type aijcrl - sets the matrix type to `MATAIJCRL` during a call to `MatSetFromOptions()`
166 
167   Level: beginner
168 
169 .seealso: [](ch_matrices), `Mat`, `MatCreateMPIAIJCRL()`, `MATSEQAIJCRL`, `MATMPIAIJCRL`
170 M*/
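
/*
   A short illustrative sketch (added, not part of the library code): an assembled AIJ matrix can be switched to
   the AIJCRL format either from the options database (-mat_type aijcrl with MatSetFromOptions()) or explicitly
   with MatConvert(). The variable names below are assumptions.

     Mat A, Acrl;

     // ... A created and assembled as a MATAIJ matrix ...
     PetscCall(MatConvert(A, MATAIJCRL, MAT_INITIAL_MATRIX, &Acrl));
     // ... use Acrl ...
     PetscCall(MatDestroy(&Acrl));
*/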
171 
172 static PetscErrorCode MatBindToCPU_MPIAIJ(Mat A, PetscBool flg)
173 {
174   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
175 
176   PetscFunctionBegin;
177 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_VIENNACL)
178   A->boundtocpu = flg;
179 #endif
180   if (a->A) PetscCall(MatBindToCPU(a->A, flg));
181   if (a->B) PetscCall(MatBindToCPU(a->B, flg));
182 
183   /* In addition to binding the diagonal and off-diagonal matrices, bind the local vectors used for matrix-vector products.
184    * This may seem a little odd for a MatBindToCPU() call to do, but it makes no sense for the binding of these vectors
185    * to differ from the parent matrix. */
186   if (a->lvec) PetscCall(VecBindToCPU(a->lvec, flg));
187   if (a->diag) PetscCall(VecBindToCPU(a->diag, flg));
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSetBlockSizes_MPIAIJ(Mat M, PetscInt rbs, PetscInt cbs)
192 {
193   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)M->data;
194 
195   PetscFunctionBegin;
196   if (mat->A) {
197     PetscCall(MatSetBlockSizes(mat->A, rbs, cbs));
198     PetscCall(MatSetBlockSizes(mat->B, rbs, 1));
199   }
200   PetscFunctionReturn(PETSC_SUCCESS);
201 }
202 
203 static PetscErrorCode MatFindNonzeroRows_MPIAIJ(Mat M, IS *keptrows)
204 {
205   Mat_MPIAIJ      *mat = (Mat_MPIAIJ *)M->data;
206   Mat_SeqAIJ      *a   = (Mat_SeqAIJ *)mat->A->data;
207   Mat_SeqAIJ      *b   = (Mat_SeqAIJ *)mat->B->data;
208   const PetscInt  *ia, *ib;
209   const MatScalar *aa, *bb, *aav, *bav;
210   PetscInt         na, nb, i, j, *rows, cnt = 0, n0rows;
211   PetscInt         m = M->rmap->n, rstart = M->rmap->rstart;
212 
213   PetscFunctionBegin;
214   *keptrows = NULL;
215 
216   ia = a->i;
217   ib = b->i;
218   PetscCall(MatSeqAIJGetArrayRead(mat->A, &aav));
219   PetscCall(MatSeqAIJGetArrayRead(mat->B, &bav));
220   for (i = 0; i < m; i++) {
221     na = ia[i + 1] - ia[i];
222     nb = ib[i + 1] - ib[i];
223     if (!na && !nb) {
224       cnt++;
225       goto ok1;
226     }
227     aa = aav + ia[i];
228     for (j = 0; j < na; j++) {
229       if (aa[j] != 0.0) goto ok1;
230     }
231     bb = PetscSafePointerPlusOffset(bav, ib[i]);
232     for (j = 0; j < nb; j++) {
233       if (bb[j] != 0.0) goto ok1;
234     }
235     cnt++;
236   ok1:;
237   }
238   PetscCallMPI(MPIU_Allreduce(&cnt, &n0rows, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)M)));
239   if (!n0rows) {
240     PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
241     PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
242     PetscFunctionReturn(PETSC_SUCCESS);
243   }
244   PetscCall(PetscMalloc1(M->rmap->n - cnt, &rows));
245   cnt = 0;
246   for (i = 0; i < m; i++) {
247     na = ia[i + 1] - ia[i];
248     nb = ib[i + 1] - ib[i];
249     if (!na && !nb) continue;
250     aa = aav + ia[i];
251     for (j = 0; j < na; j++) {
252       if (aa[j] != 0.0) {
253         rows[cnt++] = rstart + i;
254         goto ok2;
255       }
256     }
257     bb = PetscSafePointerPlusOffset(bav, ib[i]);
258     for (j = 0; j < nb; j++) {
259       if (bb[j] != 0.0) {
260         rows[cnt++] = rstart + i;
261         goto ok2;
262       }
263     }
264   ok2:;
265   }
266   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), cnt, rows, PETSC_OWN_POINTER, keptrows));
267   PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
268   PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
269   PetscFunctionReturn(PETSC_SUCCESS);
270 }
271 
272 static PetscErrorCode MatDiagonalSet_MPIAIJ(Mat Y, Vec D, InsertMode is)
273 {
274   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)Y->data;
275   PetscBool   cong;
276 
277   PetscFunctionBegin;
278   PetscCall(MatHasCongruentLayouts(Y, &cong));
279   if (Y->assembled && cong) {
280     PetscCall(MatDiagonalSet(aij->A, D, is));
281   } else {
282     PetscCall(MatDiagonalSet_Default(Y, D, is));
283   }
284   PetscFunctionReturn(PETSC_SUCCESS);
285 }
286 
287 static PetscErrorCode MatFindZeroDiagonals_MPIAIJ(Mat M, IS *zrows)
288 {
289   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)M->data;
290   PetscInt    i, rstart, nrows, *rows;
291 
292   PetscFunctionBegin;
293   *zrows = NULL;
294   PetscCall(MatFindZeroDiagonals_SeqAIJ_Private(aij->A, &nrows, &rows));
295   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
296   for (i = 0; i < nrows; i++) rows[i] += rstart;
297   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), nrows, rows, PETSC_OWN_POINTER, zrows));
298   PetscFunctionReturn(PETSC_SUCCESS);
299 }
300 
301 static PetscErrorCode MatGetColumnReductions_MPIAIJ(Mat A, PetscInt type, PetscReal *reductions)
302 {
303   Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)A->data;
304   PetscInt           i, m, n, *garray = aij->garray;
305   Mat_SeqAIJ        *a_aij = (Mat_SeqAIJ *)aij->A->data;
306   Mat_SeqAIJ        *b_aij = (Mat_SeqAIJ *)aij->B->data;
307   PetscReal         *work;
308   const PetscScalar *dummy;
309 
310   PetscFunctionBegin;
311   PetscCall(MatGetSize(A, &m, &n));
312   PetscCall(PetscCalloc1(n, &work));
313   PetscCall(MatSeqAIJGetArrayRead(aij->A, &dummy));
314   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &dummy));
315   PetscCall(MatSeqAIJGetArrayRead(aij->B, &dummy));
316   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &dummy));
317   if (type == NORM_2) {
318     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i] * a_aij->a[i]);
319     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i] * b_aij->a[i]);
320   } else if (type == NORM_1) {
321     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i]);
322     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i]);
323   } else if (type == NORM_INFINITY) {
324     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] = PetscMax(PetscAbsScalar(a_aij->a[i]), work[A->cmap->rstart + a_aij->j[i]]);
325     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] = PetscMax(PetscAbsScalar(b_aij->a[i]), work[garray[b_aij->j[i]]]);
326   } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
327     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscRealPart(a_aij->a[i]);
328     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscRealPart(b_aij->a[i]);
329   } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
330     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscImaginaryPart(a_aij->a[i]);
331     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscImaginaryPart(b_aij->a[i]);
332   } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
333   if (type == NORM_INFINITY) {
334     PetscCallMPI(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)A)));
335   } else {
336     PetscCallMPI(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
337   }
338   PetscCall(PetscFree(work));
339   if (type == NORM_2) {
340     for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
341   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
342     for (i = 0; i < n; i++) reductions[i] /= m;
343   }
344   PetscFunctionReturn(PETSC_SUCCESS);
345 }
346 
347 static PetscErrorCode MatFindOffBlockDiagonalEntries_MPIAIJ(Mat A, IS *is)
348 {
349   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
350   IS              sis, gis;
351   const PetscInt *isis, *igis;
352   PetscInt        n, *iis, nsis, ngis, rstart, i;
353 
354   PetscFunctionBegin;
355   PetscCall(MatFindOffBlockDiagonalEntries(a->A, &sis));
356   PetscCall(MatFindNonzeroRows(a->B, &gis));
357   PetscCall(ISGetSize(gis, &ngis));
358   PetscCall(ISGetSize(sis, &nsis));
359   PetscCall(ISGetIndices(sis, &isis));
360   PetscCall(ISGetIndices(gis, &igis));
361 
362   PetscCall(PetscMalloc1(ngis + nsis, &iis));
363   PetscCall(PetscArraycpy(iis, igis, ngis));
364   PetscCall(PetscArraycpy(iis + ngis, isis, nsis));
365   n = ngis + nsis;
366   PetscCall(PetscSortRemoveDupsInt(&n, iis));
367   PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
368   for (i = 0; i < n; i++) iis[i] += rstart;
369   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)A), n, iis, PETSC_OWN_POINTER, is));
370 
371   PetscCall(ISRestoreIndices(sis, &isis));
372   PetscCall(ISRestoreIndices(gis, &igis));
373   PetscCall(ISDestroy(&sis));
374   PetscCall(ISDestroy(&gis));
375   PetscFunctionReturn(PETSC_SUCCESS);
376 }
377 
378 /*
379   Local utility routine that creates a mapping from the global column
380   number to the local number in the off-diagonal part of the local
381   storage of the matrix.  When PETSC_USE_CTABLE is used this is scalable at
382   a slightly higher hash-table cost; without it, it is not scalable (each process
383   holds an array of order N integers) but is fast to access.
384 */
385 PetscErrorCode MatCreateColmap_MPIAIJ_Private(Mat mat)
386 {
387   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
388   PetscInt    n   = aij->B->cmap->n, i;
389 
390   PetscFunctionBegin;
391   PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
392 #if defined(PETSC_USE_CTABLE)
393   PetscCall(PetscHMapICreateWithSize(n, &aij->colmap));
394   for (i = 0; i < n; i++) PetscCall(PetscHMapISet(aij->colmap, aij->garray[i] + 1, i + 1));
395 #else
396   PetscCall(PetscCalloc1(mat->cmap->N + 1, &aij->colmap));
397   for (i = 0; i < n; i++) aij->colmap[aij->garray[i]] = i + 1;
398 #endif
399   PetscFunctionReturn(PETSC_SUCCESS);
400 }
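
/*
   Illustrative note (added, not part of the library code): colmap stores local indices shifted by one so that a
   lookup result of zero can mean "this global column is not present in the off-diagonal block". A query therefore
   follows the sketch below, mirroring the pattern used later in MatSetValues_MPIAIJ() and MatGetValues_MPIAIJ();
   gcol is an assumed global column index.

     PetscInt lcol;
   #if defined(PETSC_USE_CTABLE)
     PetscCall(PetscHMapIGetWithDefault(aij->colmap, gcol + 1, 0, &lcol));
   #else
     lcol = aij->colmap[gcol];
   #endif
     lcol--;                                  // lcol < 0 now means "not found in the off-diagonal block"
*/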
401 
402 #define MatSetValues_SeqAIJ_A_Private(row, col, value, addv, orow, ocol) \
403   do { \
404     if (col <= lastcol1) low1 = 0; \
405     else high1 = nrow1; \
406     lastcol1 = col; \
407     while (high1 - low1 > 5) { \
408       t = (low1 + high1) / 2; \
409       if (rp1[t] > col) high1 = t; \
410       else low1 = t; \
411     } \
412     for (_i = low1; _i < high1; _i++) { \
413       if (rp1[_i] > col) break; \
414       if (rp1[_i] == col) { \
415         if (addv == ADD_VALUES) { \
416           ap1[_i] += value; \
417           /* Not sure whether PetscLogFlops() will slow down the code or not */ \
418           (void)PetscLogFlops(1.0); \
419         } else ap1[_i] = value; \
420         goto a_noinsert; \
421       } \
422     } \
423     if (value == 0.0 && ignorezeroentries && row != col) { \
424       low1  = 0; \
425       high1 = nrow1; \
426       goto a_noinsert; \
427     } \
428     if (nonew == 1) { \
429       low1  = 0; \
430       high1 = nrow1; \
431       goto a_noinsert; \
432     } \
433     PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
434     MatSeqXAIJReallocateAIJ(A, am, 1, nrow1, row, col, rmax1, aa, ai, aj, rp1, ap1, aimax, nonew, MatScalar); \
435     N = nrow1++ - 1; \
436     a->nz++; \
437     high1++; \
438     /* shift up all the later entries in this row */ \
439     PetscCall(PetscArraymove(rp1 + _i + 1, rp1 + _i, N - _i + 1)); \
440     PetscCall(PetscArraymove(ap1 + _i + 1, ap1 + _i, N - _i + 1)); \
441     rp1[_i] = col; \
442     ap1[_i] = value; \
443   a_noinsert:; \
444     ailen[row] = nrow1; \
445   } while (0)
446 
447 #define MatSetValues_SeqAIJ_B_Private(row, col, value, addv, orow, ocol) \
448   do { \
449     if (col <= lastcol2) low2 = 0; \
450     else high2 = nrow2; \
451     lastcol2 = col; \
452     while (high2 - low2 > 5) { \
453       t = (low2 + high2) / 2; \
454       if (rp2[t] > col) high2 = t; \
455       else low2 = t; \
456     } \
457     for (_i = low2; _i < high2; _i++) { \
458       if (rp2[_i] > col) break; \
459       if (rp2[_i] == col) { \
460         if (addv == ADD_VALUES) { \
461           ap2[_i] += value; \
462           (void)PetscLogFlops(1.0); \
463         } else ap2[_i] = value; \
464         goto b_noinsert; \
465       } \
466     } \
467     if (value == 0.0 && ignorezeroentries) { \
468       low2  = 0; \
469       high2 = nrow2; \
470       goto b_noinsert; \
471     } \
472     if (nonew == 1) { \
473       low2  = 0; \
474       high2 = nrow2; \
475       goto b_noinsert; \
476     } \
477     PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
478     MatSeqXAIJReallocateAIJ(B, bm, 1, nrow2, row, col, rmax2, ba, bi, bj, rp2, ap2, bimax, nonew, MatScalar); \
479     N = nrow2++ - 1; \
480     b->nz++; \
481     high2++; \
482     /* shift up all the later entries in this row */ \
483     PetscCall(PetscArraymove(rp2 + _i + 1, rp2 + _i, N - _i + 1)); \
484     PetscCall(PetscArraymove(ap2 + _i + 1, ap2 + _i, N - _i + 1)); \
485     rp2[_i] = col; \
486     ap2[_i] = value; \
487   b_noinsert:; \
488     bilen[row] = nrow2; \
489   } while (0)
490 
491 static PetscErrorCode MatSetValuesRow_MPIAIJ(Mat A, PetscInt row, const PetscScalar v[])
492 {
493   Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)A->data;
494   Mat_SeqAIJ  *a = (Mat_SeqAIJ *)mat->A->data, *b = (Mat_SeqAIJ *)mat->B->data;
495   PetscInt     l, *garray                         = mat->garray, diag;
496   PetscScalar *aa, *ba;
497 
498   PetscFunctionBegin;
499   /* code only works for square matrices A */
500 
501   /* find size of row to the left of the diagonal part */
502   PetscCall(MatGetOwnershipRange(A, &diag, NULL));
503   row = row - diag;
504   for (l = 0; l < b->i[row + 1] - b->i[row]; l++) {
505     if (garray[b->j[b->i[row] + l]] > diag) break;
506   }
507   if (l) {
508     PetscCall(MatSeqAIJGetArray(mat->B, &ba));
509     PetscCall(PetscArraycpy(ba + b->i[row], v, l));
510     PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
511   }
512 
513   /* diagonal part */
514   if (a->i[row + 1] - a->i[row]) {
515     PetscCall(MatSeqAIJGetArray(mat->A, &aa));
516     PetscCall(PetscArraycpy(aa + a->i[row], v + l, a->i[row + 1] - a->i[row]));
517     PetscCall(MatSeqAIJRestoreArray(mat->A, &aa));
518   }
519 
520   /* right of diagonal part */
521   if (b->i[row + 1] - b->i[row] - l) {
522     PetscCall(MatSeqAIJGetArray(mat->B, &ba));
523     PetscCall(PetscArraycpy(ba + b->i[row] + l, v + l + a->i[row + 1] - a->i[row], b->i[row + 1] - b->i[row] - l));
524     PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
525   }
526   PetscFunctionReturn(PETSC_SUCCESS);
527 }
528 
529 PetscErrorCode MatSetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode addv)
530 {
531   Mat_MPIAIJ *aij   = (Mat_MPIAIJ *)mat->data;
532   PetscScalar value = 0.0;
533   PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
534   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
535   PetscBool   roworiented = aij->roworiented;
536 
537   /* Some Variables required in the macro */
538   Mat         A     = aij->A;
539   Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
540   PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
541   PetscBool   ignorezeroentries = a->ignorezeroentries;
542   Mat         B                 = aij->B;
543   Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
544   PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
545   MatScalar  *aa, *ba;
546   PetscInt   *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
547   PetscInt    nonew;
548   MatScalar  *ap1, *ap2;
549 
550   PetscFunctionBegin;
551   PetscCall(MatSeqAIJGetArray(A, &aa));
552   PetscCall(MatSeqAIJGetArray(B, &ba));
553   for (i = 0; i < m; i++) {
554     if (im[i] < 0) continue;
555     PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
556     if (im[i] >= rstart && im[i] < rend) {
557       row      = im[i] - rstart;
558       lastcol1 = -1;
559       rp1      = PetscSafePointerPlusOffset(aj, ai[row]);
560       ap1      = PetscSafePointerPlusOffset(aa, ai[row]);
561       rmax1    = aimax[row];
562       nrow1    = ailen[row];
563       low1     = 0;
564       high1    = nrow1;
565       lastcol2 = -1;
566       rp2      = PetscSafePointerPlusOffset(bj, bi[row]);
567       ap2      = PetscSafePointerPlusOffset(ba, bi[row]);
568       rmax2    = bimax[row];
569       nrow2    = bilen[row];
570       low2     = 0;
571       high2    = nrow2;
572 
573       for (j = 0; j < n; j++) {
574         if (v) value = roworiented ? v[i * n + j] : v[i + j * m];
575         if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
576         if (in[j] >= cstart && in[j] < cend) {
577           col   = in[j] - cstart;
578           nonew = a->nonew;
579           MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
580         } else if (in[j] < 0) {
581           continue;
582         } else {
583           PetscCheck(in[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
584           if (mat->was_assembled) {
585             if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
586 #if defined(PETSC_USE_CTABLE)
587             PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col)); /* map global col ids to local ones */
588             col--;
589 #else
590             col = aij->colmap[in[j]] - 1;
591 #endif
592             if (col < 0 && !((Mat_SeqAIJ *)aij->B->data)->nonew) { /* col < 0 means in[j] is a new col for B */
593               PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));  /* Change aij->B from reduced/local format to expanded/global format */
594               col = in[j];
595               /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
596               B     = aij->B;
597               b     = (Mat_SeqAIJ *)B->data;
598               bimax = b->imax;
599               bi    = b->i;
600               bilen = b->ilen;
601               bj    = b->j;
602               ba    = b->a;
603               rp2   = PetscSafePointerPlusOffset(bj, bi[row]);
604               ap2   = PetscSafePointerPlusOffset(ba, bi[row]);
605               rmax2 = bimax[row];
606               nrow2 = bilen[row];
607               low2  = 0;
608               high2 = nrow2;
609               bm    = aij->B->rmap->n;
610               ba    = b->a;
611             } else if (col < 0 && !(ignorezeroentries && value == 0.0)) {
612               if (1 == ((Mat_SeqAIJ *)aij->B->data)->nonew) {
613                 PetscCall(PetscInfo(mat, "Skipping of insertion of new nonzero location in off-diagonal portion of matrix %g(%" PetscInt_FMT ",%" PetscInt_FMT ")\n", (double)PetscRealPart(value), im[i], in[j]));
614               } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", im[i], in[j]);
615             }
616           } else col = in[j];
617           nonew = b->nonew;
618           MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
619         }
620       }
621     } else {
622       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Setting off process row %" PetscInt_FMT " even though MatSetOption(,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) was set", im[i]);
623       if (!aij->donotstash) {
624         mat->assembled = PETSC_FALSE;
625         if (roworiented) {
626           PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i * n), (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
627         } else {
628           PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i), m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
629         }
630       }
631     }
632   }
633   PetscCall(MatSeqAIJRestoreArray(A, &aa)); /* aa, ba might have been freed due to reallocation above, but we don't access them here */
634   PetscCall(MatSeqAIJRestoreArray(B, &ba));
635   PetscFunctionReturn(PETSC_SUCCESS);
636 }
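
/*
   Caller-side sketch, added for illustration (not part of the library code): values for locally owned rows are
   routed above into aij->A or aij->B, while rows owned by other processes are stashed and communicated during
   assembly. The indices and value below are assumptions.

     PetscInt    row = 0, col = 0;
     PetscScalar v   = 1.0;

     // optional: promise that no off-process entries will be set, so the stash communication can be skipped
     // PetscCall(MatSetOption(mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
     PetscCall(MatSetValues(mat, 1, &row, 1, &col, &v, ADD_VALUES));
     PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
*/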
637 
638 /*
639     This function sets the j and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
640     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
641     No off-processor parts off the matrix are allowed here and mat->was_assembled has to be PETSC_FALSE.
642 */
643 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[])
644 {
645   Mat_MPIAIJ *aij    = (Mat_MPIAIJ *)mat->data;
646   Mat         A      = aij->A; /* diagonal part of the matrix */
647   Mat         B      = aij->B; /* off-diagonal part of the matrix */
648   Mat_SeqAIJ *a      = (Mat_SeqAIJ *)A->data;
649   Mat_SeqAIJ *b      = (Mat_SeqAIJ *)B->data;
650   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, col;
651   PetscInt   *ailen = a->ilen, *aj = a->j;
652   PetscInt   *bilen = b->ilen, *bj = b->j;
653   PetscInt    am          = aij->A->rmap->n, j;
654   PetscInt    diag_so_far = 0, dnz;
655   PetscInt    offd_so_far = 0, onz;
656 
657   PetscFunctionBegin;
658   /* Iterate over all rows of the matrix */
659   for (j = 0; j < am; j++) {
660     dnz = onz = 0;
661     /*  Iterate over all non-zero columns of the current row */
662     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
663       /* If column is in the diagonal */
664       if (mat_j[col] >= cstart && mat_j[col] < cend) {
665         aj[diag_so_far++] = mat_j[col] - cstart;
666         dnz++;
667       } else { /* off-diagonal entries */
668         bj[offd_so_far++] = mat_j[col];
669         onz++;
670       }
671     }
672     ailen[j] = dnz;
673     bilen[j] = onz;
674   }
675   PetscFunctionReturn(PETSC_SUCCESS);
676 }
677 
678 /*
679     This function sets the local j, a and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
680     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
681     No off-process parts of the matrix are allowed here; they are set at a later point by MatSetValues_MPIAIJ().
682     Also, mat->was_assembled has to be false, otherwise the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart;
683     would not be true and the more complex MatSetValues_MPIAIJ has to be used.
684 */
685 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[], const PetscScalar mat_a[])
686 {
687   Mat_MPIAIJ  *aij  = (Mat_MPIAIJ *)mat->data;
688   Mat          A    = aij->A; /* diagonal part of the matrix */
689   Mat          B    = aij->B; /* off-diagonal part of the matrix */
690   Mat_SeqAIJ  *aijd = (Mat_SeqAIJ *)aij->A->data, *aijo = (Mat_SeqAIJ *)aij->B->data;
691   Mat_SeqAIJ  *a      = (Mat_SeqAIJ *)A->data;
692   Mat_SeqAIJ  *b      = (Mat_SeqAIJ *)B->data;
693   PetscInt     cstart = mat->cmap->rstart, cend = mat->cmap->rend;
694   PetscInt    *ailen = a->ilen, *aj = a->j;
695   PetscInt    *bilen = b->ilen, *bj = b->j;
696   PetscInt     am          = aij->A->rmap->n, j;
697   PetscInt    *full_diag_i = aijd->i, *full_offd_i = aijo->i; /* These variables can also include non-local elements, which are set at a later point. */
698   PetscInt     col, dnz_row, onz_row, rowstart_diag, rowstart_offd;
699   PetscScalar *aa = a->a, *ba = b->a;
700 
701   PetscFunctionBegin;
702   /* Iterate over all rows of the matrix */
703   for (j = 0; j < am; j++) {
704     dnz_row = onz_row = 0;
705     rowstart_offd     = full_offd_i[j];
706     rowstart_diag     = full_diag_i[j];
707     /*  Iterate over all non-zero columns of the current row */
708     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
709       /* If column is in the diagonal */
710       if (mat_j[col] >= cstart && mat_j[col] < cend) {
711         aj[rowstart_diag + dnz_row] = mat_j[col] - cstart;
712         aa[rowstart_diag + dnz_row] = mat_a[col];
713         dnz_row++;
714       } else { /* off-diagonal entries */
715         bj[rowstart_offd + onz_row] = mat_j[col];
716         ba[rowstart_offd + onz_row] = mat_a[col];
717         onz_row++;
718       }
719     }
720     ailen[j] = dnz_row;
721     bilen[j] = onz_row;
722   }
723   PetscFunctionReturn(PETSC_SUCCESS);
724 }
725 
726 static PetscErrorCode MatGetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
727 {
728   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
729   PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
730   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
731 
732   PetscFunctionBegin;
733   for (i = 0; i < m; i++) {
734     if (idxm[i] < 0) continue; /* negative row */
735     PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, idxm[i], mat->rmap->N - 1);
736     PetscCheck(idxm[i] >= rstart && idxm[i] < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported, row requested %" PetscInt_FMT " range [%" PetscInt_FMT " %" PetscInt_FMT ")", idxm[i], rstart, rend);
737     row = idxm[i] - rstart;
738     for (j = 0; j < n; j++) {
739       if (idxn[j] < 0) continue; /* negative column */
740       PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, idxn[j], mat->cmap->N - 1);
741       if (idxn[j] >= cstart && idxn[j] < cend) {
742         col = idxn[j] - cstart;
743         PetscCall(MatGetValues(aij->A, 1, &row, 1, &col, v + i * n + j));
744       } else {
745         if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
746 #if defined(PETSC_USE_CTABLE)
747         PetscCall(PetscHMapIGetWithDefault(aij->colmap, idxn[j] + 1, 0, &col));
748         col--;
749 #else
750         col = aij->colmap[idxn[j]] - 1;
751 #endif
752         if ((col < 0) || (aij->garray[col] != idxn[j])) *(v + i * n + j) = 0.0;
753         else PetscCall(MatGetValues(aij->B, 1, &row, 1, &col, v + i * n + j));
754       }
755     }
756   }
757   PetscFunctionReturn(PETSC_SUCCESS);
758 }
759 
760 static PetscErrorCode MatAssemblyBegin_MPIAIJ(Mat mat, MatAssemblyType mode)
761 {
762   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
763   PetscInt    nstash, reallocs;
764 
765   PetscFunctionBegin;
766   if (aij->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);
767 
768   PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
769   PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
770   PetscCall(PetscInfo(aij->A, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
771   PetscFunctionReturn(PETSC_SUCCESS);
772 }
773 
774 PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat mat, MatAssemblyType mode)
775 {
776   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *)mat->data;
777   PetscMPIInt  n;
778   PetscInt     i, j, rstart, ncols, flg;
779   PetscInt    *row, *col;
780   PetscBool    all_assembled;
781   PetscScalar *val;
782 
783   /* do not use 'b = (Mat_SeqAIJ*)aij->B->data' as B can be reset in disassembly */
784 
785   PetscFunctionBegin;
786   if (!aij->donotstash && !mat->nooffprocentries) {
787     while (1) {
788       PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
789       if (!flg) break;
790 
791       for (i = 0; i < n;) {
792         /* Now identify the consecutive vals belonging to the same row */
793         for (j = i, rstart = row[j]; j < n; j++) {
794           if (row[j] != rstart) break;
795         }
796         if (j < n) ncols = j - i;
797         else ncols = n - i;
798         /* Now assemble all these values with a single function call */
799         PetscCall(MatSetValues_MPIAIJ(mat, 1, row + i, ncols, col + i, val + i, mat->insertmode));
800         i = j;
801       }
802     }
803     PetscCall(MatStashScatterEnd_Private(&mat->stash));
804   }
805 #if defined(PETSC_HAVE_DEVICE)
806   if (mat->offloadmask == PETSC_OFFLOAD_CPU) aij->A->offloadmask = PETSC_OFFLOAD_CPU;
807   /* We call MatBindToCPU() on aij->A and aij->B here, because if MatBindToCPU_MPIAIJ() is called before assembly, it cannot bind these. */
808   if (mat->boundtocpu) {
809     PetscCall(MatBindToCPU(aij->A, PETSC_TRUE));
810     PetscCall(MatBindToCPU(aij->B, PETSC_TRUE));
811   }
812 #endif
813   PetscCall(MatAssemblyBegin(aij->A, mode));
814   PetscCall(MatAssemblyEnd(aij->A, mode));
815 
816   /* determine if any process has disassembled; if so, we must
817      also disassemble ourselves, in order that we may reassemble. */
818   /*
819      if the nonzero structure of the submatrix B cannot change then we know that
820      no process disassembled, thus we can skip this step
821   */
822   if (!((Mat_SeqAIJ *)aij->B->data)->nonew) {
823     PetscCallMPI(MPIU_Allreduce(&mat->was_assembled, &all_assembled, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
824     if (mat->was_assembled && !all_assembled) { /* mat on this rank has reduced off-diag B with local col ids, but globally it does not */
825       PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
826     }
827   }
828   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(MatSetUpMultiply_MPIAIJ(mat));
829   PetscCall(MatSetOption(aij->B, MAT_USE_INODES, PETSC_FALSE));
830 #if defined(PETSC_HAVE_DEVICE)
831   if (mat->offloadmask == PETSC_OFFLOAD_CPU && aij->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) aij->B->offloadmask = PETSC_OFFLOAD_CPU;
832 #endif
833   PetscCall(MatAssemblyBegin(aij->B, mode));
834   PetscCall(MatAssemblyEnd(aij->B, mode));
835 
836   PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
837 
838   aij->rowvalues = NULL;
839 
840   PetscCall(VecDestroy(&aij->diag));
841 
842   /* if no new nonzero locations are allowed in the matrix then only set the matrix state the first time through */
843   if ((!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) || !((Mat_SeqAIJ *)aij->A->data)->nonew) {
844     PetscObjectState state = aij->A->nonzerostate + aij->B->nonzerostate;
845     PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
846   }
847 #if defined(PETSC_HAVE_DEVICE)
848   mat->offloadmask = PETSC_OFFLOAD_BOTH;
849 #endif
850   PetscFunctionReturn(PETSC_SUCCESS);
851 }
852 
853 static PetscErrorCode MatZeroEntries_MPIAIJ(Mat A)
854 {
855   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
856 
857   PetscFunctionBegin;
858   PetscCall(MatZeroEntries(l->A));
859   PetscCall(MatZeroEntries(l->B));
860   PetscFunctionReturn(PETSC_SUCCESS);
861 }
862 
863 static PetscErrorCode MatZeroRows_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
864 {
865   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)A->data;
866   PetscInt   *lrows;
867   PetscInt    r, len;
868   PetscBool   cong;
869 
870   PetscFunctionBegin;
871   /* get locally owned rows */
872   PetscCall(MatZeroRowsMapLocal_Private(A, N, rows, &len, &lrows));
873   PetscCall(MatHasCongruentLayouts(A, &cong));
874   /* fix right-hand side if needed */
875   if (x && b) {
876     const PetscScalar *xx;
877     PetscScalar       *bb;
878 
879     PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
880     PetscCall(VecGetArrayRead(x, &xx));
881     PetscCall(VecGetArray(b, &bb));
882     for (r = 0; r < len; ++r) bb[lrows[r]] = diag * xx[lrows[r]];
883     PetscCall(VecRestoreArrayRead(x, &xx));
884     PetscCall(VecRestoreArray(b, &bb));
885   }
886 
887   if (diag != 0.0 && cong) {
888     PetscCall(MatZeroRows(mat->A, len, lrows, diag, NULL, NULL));
889     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
890   } else if (diag != 0.0) { /* non-square or non-congruent layouts -> if keepnonzeropattern is false, we allow for new insertion */
891     Mat_SeqAIJ *aijA = (Mat_SeqAIJ *)mat->A->data;
892     Mat_SeqAIJ *aijB = (Mat_SeqAIJ *)mat->B->data;
893     PetscInt    nnwA, nnwB;
894     PetscBool   nnzA, nnzB;
895 
896     nnwA = aijA->nonew;
897     nnwB = aijB->nonew;
898     nnzA = aijA->keepnonzeropattern;
899     nnzB = aijB->keepnonzeropattern;
900     if (!nnzA) {
901       PetscCall(PetscInfo(mat->A, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on diagonal block.\n"));
902       aijA->nonew = 0;
903     }
904     if (!nnzB) {
905       PetscCall(PetscInfo(mat->B, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on off-diagonal block.\n"));
906       aijB->nonew = 0;
907     }
908     /* Must zero here before the next loop */
909     PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
910     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
911     for (r = 0; r < len; ++r) {
912       const PetscInt row = lrows[r] + A->rmap->rstart;
913       if (row >= A->cmap->N) continue;
914       PetscCall(MatSetValues(A, 1, &row, 1, &row, &diag, INSERT_VALUES));
915     }
916     aijA->nonew = nnwA;
917     aijB->nonew = nnwB;
918   } else {
919     PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
920     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
921   }
922   PetscCall(PetscFree(lrows));
923   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
924   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
925 
926   /* only change matrix nonzero state if pattern was allowed to be changed */
927   if (!((Mat_SeqAIJ *)mat->A->data)->keepnonzeropattern || !((Mat_SeqAIJ *)mat->A->data)->nonew) {
928     PetscObjectState state = mat->A->nonzerostate + mat->B->nonzerostate;
929     PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
930   }
931   PetscFunctionReturn(PETSC_SUCCESS);
932 }
933 
934 static PetscErrorCode MatZeroRowsColumns_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
935 {
936   Mat_MPIAIJ        *l = (Mat_MPIAIJ *)A->data;
937   PetscInt           n = A->rmap->n;
938   PetscInt           i, j, r, m, len = 0;
939   PetscInt          *lrows, *owners = A->rmap->range;
940   PetscMPIInt        p = 0;
941   PetscSFNode       *rrows;
942   PetscSF            sf;
943   const PetscScalar *xx;
944   PetscScalar       *bb, *mask, *aij_a;
945   Vec                xmask, lmask;
946   Mat_SeqAIJ        *aij = (Mat_SeqAIJ *)l->B->data;
947   const PetscInt    *aj, *ii, *ridx;
948   PetscScalar       *aa;
949 
950   PetscFunctionBegin;
951   /* Create SF where leaves are input rows and roots are owned rows */
952   PetscCall(PetscMalloc1(n, &lrows));
953   for (r = 0; r < n; ++r) lrows[r] = -1;
954   PetscCall(PetscMalloc1(N, &rrows));
955   for (r = 0; r < N; ++r) {
956     const PetscInt idx = rows[r];
957     PetscCheck(idx >= 0 && A->rmap->N > idx, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range [0,%" PetscInt_FMT ")", idx, A->rmap->N);
958     if (idx < owners[p] || owners[p + 1] <= idx) { /* short-circuit the search if the last p owns this row too */
959       PetscCall(PetscLayoutFindOwner(A->rmap, idx, &p));
960     }
961     rrows[r].rank  = p;
962     rrows[r].index = rows[r] - owners[p];
963   }
964   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
965   PetscCall(PetscSFSetGraph(sf, n, N, NULL, PETSC_OWN_POINTER, rrows, PETSC_OWN_POINTER));
966   /* Collect flags for rows to be zeroed */
967   PetscCall(PetscSFReduceBegin(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
968   PetscCall(PetscSFReduceEnd(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
969   PetscCall(PetscSFDestroy(&sf));
970   /* Compress and put in row numbers */
971   for (r = 0; r < n; ++r)
972     if (lrows[r] >= 0) lrows[len++] = r;
973   /* zero diagonal part of matrix */
974   PetscCall(MatZeroRowsColumns(l->A, len, lrows, diag, x, b));
975   /* handle off-diagonal part of matrix */
976   PetscCall(MatCreateVecs(A, &xmask, NULL));
977   PetscCall(VecDuplicate(l->lvec, &lmask));
978   PetscCall(VecGetArray(xmask, &bb));
979   for (i = 0; i < len; i++) bb[lrows[i]] = 1;
980   PetscCall(VecRestoreArray(xmask, &bb));
981   PetscCall(VecScatterBegin(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
982   PetscCall(VecScatterEnd(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
983   PetscCall(VecDestroy(&xmask));
984   if (x && b) { /* this code is buggy when the row and column layouts don't match */
985     PetscBool cong;
986 
987     PetscCall(MatHasCongruentLayouts(A, &cong));
988     PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
989     PetscCall(VecScatterBegin(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
990     PetscCall(VecScatterEnd(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
991     PetscCall(VecGetArrayRead(l->lvec, &xx));
992     PetscCall(VecGetArray(b, &bb));
993   }
994   PetscCall(VecGetArray(lmask, &mask));
995   /* remove zeroed rows of off-diagonal matrix */
996   PetscCall(MatSeqAIJGetArray(l->B, &aij_a));
997   ii = aij->i;
998   for (i = 0; i < len; i++) PetscCall(PetscArrayzero(PetscSafePointerPlusOffset(aij_a, ii[lrows[i]]), ii[lrows[i] + 1] - ii[lrows[i]]));
999   /* loop over all elements of the off-process part of the matrix, zeroing removed columns */
1000   if (aij->compressedrow.use) {
1001     m    = aij->compressedrow.nrows;
1002     ii   = aij->compressedrow.i;
1003     ridx = aij->compressedrow.rindex;
1004     for (i = 0; i < m; i++) {
1005       n  = ii[i + 1] - ii[i];
1006       aj = aij->j + ii[i];
1007       aa = aij_a + ii[i];
1008 
1009       for (j = 0; j < n; j++) {
1010         if (PetscAbsScalar(mask[*aj])) {
1011           if (b) bb[*ridx] -= *aa * xx[*aj];
1012           *aa = 0.0;
1013         }
1014         aa++;
1015         aj++;
1016       }
1017       ridx++;
1018     }
1019   } else { /* do not use compressed row format */
1020     m = l->B->rmap->n;
1021     for (i = 0; i < m; i++) {
1022       n  = ii[i + 1] - ii[i];
1023       aj = aij->j + ii[i];
1024       aa = aij_a + ii[i];
1025       for (j = 0; j < n; j++) {
1026         if (PetscAbsScalar(mask[*aj])) {
1027           if (b) bb[i] -= *aa * xx[*aj];
1028           *aa = 0.0;
1029         }
1030         aa++;
1031         aj++;
1032       }
1033     }
1034   }
1035   if (x && b) {
1036     PetscCall(VecRestoreArray(b, &bb));
1037     PetscCall(VecRestoreArrayRead(l->lvec, &xx));
1038   }
1039   PetscCall(MatSeqAIJRestoreArray(l->B, &aij_a));
1040   PetscCall(VecRestoreArray(lmask, &mask));
1041   PetscCall(VecDestroy(&lmask));
1042   PetscCall(PetscFree(lrows));
1043 
1044   /* only change matrix nonzero state if pattern was allowed to be changed */
1045   if (!((Mat_SeqAIJ *)l->A->data)->nonew) {
1046     PetscObjectState state = l->A->nonzerostate + l->B->nonzerostate;
1047     PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
1048   }
1049   PetscFunctionReturn(PETSC_SUCCESS);
1050 }
1051 
1052 static PetscErrorCode MatMult_MPIAIJ(Mat A, Vec xx, Vec yy)
1053 {
1054   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1055   PetscInt    nt;
1056   VecScatter  Mvctx = a->Mvctx;
1057 
1058   PetscFunctionBegin;
1059   PetscCall(VecGetLocalSize(xx, &nt));
1060   PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
1061   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1062   PetscUseTypeMethod(a->A, mult, xx, yy);
1063   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1064   PetscUseTypeMethod(a->B, multadd, a->lvec, yy, yy);
1065   PetscFunctionReturn(PETSC_SUCCESS);
1066 }
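
/*
   Usage sketch, added for illustration (not part of the library code): the scatter-plus-local-multiply structure
   above sits behind the ordinary MatMult() interface. Creating the vectors with MatCreateVecs() guarantees the
   layout compatibility checked at the top of MatMult_MPIAIJ(). The variable names are assumptions.

     Vec x, y;

     PetscCall(MatCreateVecs(A, &x, &y));    // x matches the column layout of A, y the row layout
     PetscCall(VecSet(x, 1.0));
     PetscCall(MatMult(A, x, y));            // y = A*x
     PetscCall(VecDestroy(&x));
     PetscCall(VecDestroy(&y));
*/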
1067 
1068 static PetscErrorCode MatMultDiagonalBlock_MPIAIJ(Mat A, Vec bb, Vec xx)
1069 {
1070   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1071 
1072   PetscFunctionBegin;
1073   PetscCall(MatMultDiagonalBlock(a->A, bb, xx));
1074   PetscFunctionReturn(PETSC_SUCCESS);
1075 }
1076 
1077 static PetscErrorCode MatMultAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1078 {
1079   Mat_MPIAIJ *a     = (Mat_MPIAIJ *)A->data;
1080   VecScatter  Mvctx = a->Mvctx;
1081 
1082   PetscFunctionBegin;
1083   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1084   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
1085   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1086   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
1087   PetscFunctionReturn(PETSC_SUCCESS);
1088 }
1089 
1090 static PetscErrorCode MatMultTranspose_MPIAIJ(Mat A, Vec xx, Vec yy)
1091 {
1092   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1093 
1094   PetscFunctionBegin;
1095   /* do nondiagonal part */
1096   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1097   /* do local part */
1098   PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
1099   /* add partial results together */
1100   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1101   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1102   PetscFunctionReturn(PETSC_SUCCESS);
1103 }
1104 
1105 static PetscErrorCode MatIsTranspose_MPIAIJ(Mat Amat, Mat Bmat, PetscReal tol, PetscBool *f)
1106 {
1107   MPI_Comm    comm;
1108   Mat_MPIAIJ *Aij = (Mat_MPIAIJ *)Amat->data, *Bij = (Mat_MPIAIJ *)Bmat->data;
1109   Mat         Adia = Aij->A, Bdia = Bij->A, Aoff, Boff, *Aoffs, *Boffs;
1110   IS          Me, Notme;
1111   PetscInt    M, N, first, last, *notme, i;
1112   PetscBool   lf;
1113   PetscMPIInt size;
1114 
1115   PetscFunctionBegin;
1116   /* Easy test: symmetric diagonal block */
1117   PetscCall(MatIsTranspose(Adia, Bdia, tol, &lf));
1118   PetscCallMPI(MPIU_Allreduce(&lf, f, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)Amat)));
1119   if (!*f) PetscFunctionReturn(PETSC_SUCCESS);
1120   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
1121   PetscCallMPI(MPI_Comm_size(comm, &size));
1122   if (size == 1) PetscFunctionReturn(PETSC_SUCCESS);
1123 
1124   /* Hard test: off-diagonal block. This takes a MatCreateSubMatrices() call. */
1125   PetscCall(MatGetSize(Amat, &M, &N));
1126   PetscCall(MatGetOwnershipRange(Amat, &first, &last));
1127   PetscCall(PetscMalloc1(N - last + first, &notme));
1128   for (i = 0; i < first; i++) notme[i] = i;
1129   for (i = last; i < M; i++) notme[i - last + first] = i;
1130   PetscCall(ISCreateGeneral(MPI_COMM_SELF, N - last + first, notme, PETSC_COPY_VALUES, &Notme));
1131   PetscCall(ISCreateStride(MPI_COMM_SELF, last - first, first, 1, &Me));
1132   PetscCall(MatCreateSubMatrices(Amat, 1, &Me, &Notme, MAT_INITIAL_MATRIX, &Aoffs));
1133   Aoff = Aoffs[0];
1134   PetscCall(MatCreateSubMatrices(Bmat, 1, &Notme, &Me, MAT_INITIAL_MATRIX, &Boffs));
1135   Boff = Boffs[0];
1136   PetscCall(MatIsTranspose(Aoff, Boff, tol, f));
1137   PetscCall(MatDestroyMatrices(1, &Aoffs));
1138   PetscCall(MatDestroyMatrices(1, &Boffs));
1139   PetscCall(ISDestroy(&Me));
1140   PetscCall(ISDestroy(&Notme));
1141   PetscCall(PetscFree(notme));
1142   PetscFunctionReturn(PETSC_SUCCESS);
1143 }
1144 
1145 static PetscErrorCode MatMultTransposeAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1146 {
1147   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1148 
1149   PetscFunctionBegin;
1150   /* do nondiagonal part */
1151   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1152   /* do local part */
1153   PetscCall((*a->A->ops->multtransposeadd)(a->A, xx, yy, zz));
1154   /* add partial results together */
1155   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1156   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1157   PetscFunctionReturn(PETSC_SUCCESS);
1158 }
1159 
1160 /*
1161   This only works correctly for square matrices where the subblock A->A is the
1162    diagonal block
1163 */
1164 static PetscErrorCode MatGetDiagonal_MPIAIJ(Mat A, Vec v)
1165 {
1166   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1167 
1168   PetscFunctionBegin;
1169   PetscCheck(A->rmap->N == A->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Supports only square matrix where A->A is diag block");
1170   PetscCheck(A->rmap->rstart == A->cmap->rstart && A->rmap->rend == A->cmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "row partition must equal col partition");
1171   PetscCall(MatGetDiagonal(a->A, v));
1172   PetscFunctionReturn(PETSC_SUCCESS);
1173 }
1174 
1175 static PetscErrorCode MatScale_MPIAIJ(Mat A, PetscScalar aa)
1176 {
1177   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1178 
1179   PetscFunctionBegin;
1180   PetscCall(MatScale(a->A, aa));
1181   PetscCall(MatScale(a->B, aa));
1182   PetscFunctionReturn(PETSC_SUCCESS);
1183 }
1184 
1185 static PetscErrorCode MatView_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
1186 {
1187   Mat_MPIAIJ        *aij    = (Mat_MPIAIJ *)mat->data;
1188   Mat_SeqAIJ        *A      = (Mat_SeqAIJ *)aij->A->data;
1189   Mat_SeqAIJ        *B      = (Mat_SeqAIJ *)aij->B->data;
1190   const PetscInt    *garray = aij->garray;
1191   const PetscScalar *aa, *ba;
1192   PetscInt           header[4], M, N, m, rs, cs, cnt, i, ja, jb;
1193   PetscInt64         nz, hnz;
1194   PetscInt          *rowlens;
1195   PetscInt          *colidxs;
1196   PetscScalar       *matvals;
1197   PetscMPIInt        rank;
1198 
1199   PetscFunctionBegin;
1200   PetscCall(PetscViewerSetUp(viewer));
1201 
1202   M  = mat->rmap->N;
1203   N  = mat->cmap->N;
1204   m  = mat->rmap->n;
1205   rs = mat->rmap->rstart;
1206   cs = mat->cmap->rstart;
1207   nz = A->nz + B->nz;
1208 
1209   /* write matrix header */
1210   header[0] = MAT_FILE_CLASSID;
1211   header[1] = M;
1212   header[2] = N;
1213   PetscCallMPI(MPI_Reduce(&nz, &hnz, 1, MPIU_INT64, MPI_SUM, 0, PetscObjectComm((PetscObject)mat)));
1214   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1215   if (rank == 0) PetscCall(PetscIntCast(hnz, &header[3]));
1216   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));
1217 
1218   /* fill in and store row lengths  */
1219   PetscCall(PetscMalloc1(m, &rowlens));
1220   for (i = 0; i < m; i++) rowlens[i] = A->i[i + 1] - A->i[i] + B->i[i + 1] - B->i[i];
1221   PetscCall(PetscViewerBinaryWriteAll(viewer, rowlens, m, rs, M, PETSC_INT));
1222   PetscCall(PetscFree(rowlens));
1223 
1224   /* fill in and store column indices */
1225   PetscCall(PetscMalloc1(nz, &colidxs));
1226   for (cnt = 0, i = 0; i < m; i++) {
1227     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1228       if (garray[B->j[jb]] > cs) break;
1229       colidxs[cnt++] = garray[B->j[jb]];
1230     }
1231     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) colidxs[cnt++] = A->j[ja] + cs;
1232     for (; jb < B->i[i + 1]; jb++) colidxs[cnt++] = garray[B->j[jb]];
1233   }
1234   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1235   PetscCall(PetscViewerBinaryWriteAll(viewer, colidxs, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
1236   PetscCall(PetscFree(colidxs));
1237 
1238   /* fill in and store nonzero values */
1239   PetscCall(MatSeqAIJGetArrayRead(aij->A, &aa));
1240   PetscCall(MatSeqAIJGetArrayRead(aij->B, &ba));
1241   PetscCall(PetscMalloc1(nz, &matvals));
1242   for (cnt = 0, i = 0; i < m; i++) {
1243     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1244       if (garray[B->j[jb]] > cs) break;
1245       matvals[cnt++] = ba[jb];
1246     }
1247     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) matvals[cnt++] = aa[ja];
1248     for (; jb < B->i[i + 1]; jb++) matvals[cnt++] = ba[jb];
1249   }
1250   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &aa));
1251   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &ba));
1252   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1253   PetscCall(PetscViewerBinaryWriteAll(viewer, matvals, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
1254   PetscCall(PetscFree(matvals));
1255 
1256   /* write block size option to the viewer's .info file */
1257   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1258   PetscFunctionReturn(PETSC_SUCCESS);
1259 }
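
/*
   Worked example of the column interleaving in MatView_MPIAIJ_Binary() above (the values below are
   illustrative assumptions, not taken from any particular run): suppose this rank owns global
   columns cs = 4 .. 7, garray = {1, 3, 9}, and a row has A->j = {0, 2} and B->j = {0, 1, 2}
   (compressed indices). The first B loop emits garray[0] = 1 and garray[1] = 3 (both below cs) and
   stops at garray[2] = 9; the A loop emits 0 + cs = 4 and 2 + cs = 6; the trailing B loop emits the
   remaining 9. The row is therefore written with globally sorted columns {1, 3, 4, 6, 9}.
*/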
1260 
1261 #include <petscdraw.h>
1262 static PetscErrorCode MatView_MPIAIJ_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
1263 {
1264   Mat_MPIAIJ       *aij  = (Mat_MPIAIJ *)mat->data;
1265   PetscMPIInt       rank = aij->rank, size = aij->size;
1266   PetscBool         isdraw, iascii, isbinary;
1267   PetscViewer       sviewer;
1268   PetscViewerFormat format;
1269 
1270   PetscFunctionBegin;
1271   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1272   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1273   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1274   if (iascii) {
1275     PetscCall(PetscViewerGetFormat(viewer, &format));
1276     if (format == PETSC_VIEWER_LOAD_BALANCE) {
1277       PetscInt i, nmax = 0, nmin = PETSC_INT_MAX, navg = 0, *nz, nzlocal = ((Mat_SeqAIJ *)aij->A->data)->nz + ((Mat_SeqAIJ *)aij->B->data)->nz;
1278       PetscCall(PetscMalloc1(size, &nz));
1279       PetscCallMPI(MPI_Allgather(&nzlocal, 1, MPIU_INT, nz, 1, MPIU_INT, PetscObjectComm((PetscObject)mat)));
1280       for (i = 0; i < size; i++) {
1281         nmax = PetscMax(nmax, nz[i]);
1282         nmin = PetscMin(nmin, nz[i]);
1283         navg += nz[i];
1284       }
1285       PetscCall(PetscFree(nz));
1286       navg = navg / size;
1287       PetscCall(PetscViewerASCIIPrintf(viewer, "Load Balance - Nonzeros: Min %" PetscInt_FMT "  avg %" PetscInt_FMT "  max %" PetscInt_FMT "\n", nmin, navg, nmax));
1288       PetscFunctionReturn(PETSC_SUCCESS);
1289     }
1290     PetscCall(PetscViewerGetFormat(viewer, &format));
1291     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1292       MatInfo   info;
1293       PetscInt *inodes = NULL;
1294 
1295       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1296       PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
1297       PetscCall(MatInodeGetInodeSizes(aij->A, NULL, &inodes, NULL));
1298       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
1299       if (!inodes) {
1300         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, not using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
1301                                                      info.memory));
1302       } else {
1303         PetscCall(
1304           PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated, info.memory));
1305       }
1306       PetscCall(MatGetInfo(aij->A, MAT_LOCAL, &info));
1307       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] on-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1308       PetscCall(MatGetInfo(aij->B, MAT_LOCAL, &info));
1309       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] off-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1310       PetscCall(PetscViewerFlush(viewer));
1311       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
1312       PetscCall(PetscViewerASCIIPrintf(viewer, "Information on VecScatter used in matrix-vector product: \n"));
1313       PetscCall(VecScatterView(aij->Mvctx, viewer));
1314       PetscFunctionReturn(PETSC_SUCCESS);
1315     } else if (format == PETSC_VIEWER_ASCII_INFO) {
1316       PetscInt inodecount, inodelimit, *inodes;
1317       PetscCall(MatInodeGetInodeSizes(aij->A, &inodecount, &inodes, &inodelimit));
1318       if (inodes) {
1319         PetscCall(PetscViewerASCIIPrintf(viewer, "using I-node (on process 0) routines: found %" PetscInt_FMT " nodes, limit used is %" PetscInt_FMT "\n", inodecount, inodelimit));
1320       } else {
1321         PetscCall(PetscViewerASCIIPrintf(viewer, "not using I-node (on process 0) routines\n"));
1322       }
1323       PetscFunctionReturn(PETSC_SUCCESS);
1324     } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1325       PetscFunctionReturn(PETSC_SUCCESS);
1326     }
1327   } else if (isbinary) {
1328     if (size == 1) {
1329       PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1330       PetscCall(MatView(aij->A, viewer));
1331     } else {
1332       PetscCall(MatView_MPIAIJ_Binary(mat, viewer));
1333     }
1334     PetscFunctionReturn(PETSC_SUCCESS);
1335   } else if (iascii && size == 1) {
1336     PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1337     PetscCall(MatView(aij->A, viewer));
1338     PetscFunctionReturn(PETSC_SUCCESS);
1339   } else if (isdraw) {
1340     PetscDraw draw;
1341     PetscBool isnull;
1342     PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1343     PetscCall(PetscDrawIsNull(draw, &isnull));
1344     if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1345   }
1346 
1347   { /* assemble the entire matrix onto first processor */
1348     Mat A = NULL, Av;
1349     IS  isrow, iscol;
1350 
1351     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->rmap->N : 0, 0, 1, &isrow));
1352     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->cmap->N : 0, 0, 1, &iscol));
1353     PetscCall(MatCreateSubMatrix(mat, isrow, iscol, MAT_INITIAL_MATRIX, &A));
1354     PetscCall(MatMPIAIJGetSeqAIJ(A, &Av, NULL, NULL));
1355     /* The commented-out code below achieves the same gather with MatCreateSubMatrices() instead of MatCreateSubMatrix() */
1356     /*
1357     Mat *AA, A = NULL, Av;
1358     IS  isrow,iscol;
1359 
1360     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->rmap->N : 0,0,1,&isrow));
1361     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->cmap->N : 0,0,1,&iscol));
1362     PetscCall(MatCreateSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&AA));
1363     if (rank == 0) {
1364        PetscCall(PetscObjectReference((PetscObject)AA[0]));
1365        A    = AA[0];
1366        Av   = AA[0];
1367     }
1368     PetscCall(MatDestroySubMatrices(1,&AA));
1369 */
1370     PetscCall(ISDestroy(&iscol));
1371     PetscCall(ISDestroy(&isrow));
1372     /*
1373        Everyone has to call to draw the matrix since the graphics waits are
1374        synchronized across all processors that share the PetscDraw object
1375     */
1376     PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1377     if (rank == 0) {
1378       if (((PetscObject)mat)->name) PetscCall(PetscObjectSetName((PetscObject)Av, ((PetscObject)mat)->name));
1379       PetscCall(MatView_SeqAIJ(Av, sviewer));
1380     }
1381     PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1382     PetscCall(MatDestroy(&A));
1383   }
1384   PetscFunctionReturn(PETSC_SUCCESS);
1385 }
1386 
1387 PetscErrorCode MatView_MPIAIJ(Mat mat, PetscViewer viewer)
1388 {
1389   PetscBool iascii, isdraw, issocket, isbinary;
1390 
1391   PetscFunctionBegin;
1392   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1393   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1394   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1395   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
1396   if (iascii || isdraw || isbinary || issocket) PetscCall(MatView_MPIAIJ_ASCIIorDraworSocket(mat, viewer));
1397   PetscFunctionReturn(PETSC_SUCCESS);
1398 }
1399 
1400 static PetscErrorCode MatSOR_MPIAIJ(Mat matin, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1401 {
1402   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)matin->data;
1403   Vec         bb1 = NULL;
1404   PetscBool   hasop;
1405 
1406   PetscFunctionBegin;
1407   if (flag == SOR_APPLY_UPPER) {
1408     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1409     PetscFunctionReturn(PETSC_SUCCESS);
1410   }
1411 
1412   if (its > 1 || ~flag & SOR_ZERO_INITIAL_GUESS || flag & SOR_EISENSTAT) PetscCall(VecDuplicate(bb, &bb1)); /* work vector needed unless a single zero-initial-guess sweep suffices; ~flag & BIT is nonzero exactly when BIT is not set in flag */
1413 
1414   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP) {
1415     if (flag & SOR_ZERO_INITIAL_GUESS) {
1416       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1417       its--;
1418     }
1419 
1420     while (its--) {
1421       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1422       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1423 
1424       /* update rhs: bb1 = bb - B*x */
1425       PetscCall(VecScale(mat->lvec, -1.0));
1426       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1427 
1428       /* local sweep */
1429       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_SYMMETRIC_SWEEP, fshift, lits, 1, xx));
1430     }
1431   } else if (flag & SOR_LOCAL_FORWARD_SWEEP) {
1432     if (flag & SOR_ZERO_INITIAL_GUESS) {
1433       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1434       its--;
1435     }
1436     while (its--) {
1437       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1438       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1439 
1440       /* update rhs: bb1 = bb - B*x */
1441       PetscCall(VecScale(mat->lvec, -1.0));
1442       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1443 
1444       /* local sweep */
1445       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_FORWARD_SWEEP, fshift, lits, 1, xx));
1446     }
1447   } else if (flag & SOR_LOCAL_BACKWARD_SWEEP) {
1448     if (flag & SOR_ZERO_INITIAL_GUESS) {
1449       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1450       its--;
1451     }
1452     while (its--) {
1453       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1454       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1455 
1456       /* update rhs: bb1 = bb - B*x */
1457       PetscCall(VecScale(mat->lvec, -1.0));
1458       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1459 
1460       /* local sweep */
1461       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_BACKWARD_SWEEP, fshift, lits, 1, xx));
1462     }
1463   } else if (flag & SOR_EISENSTAT) {
1464     Vec xx1;
1465 
1466     PetscCall(VecDuplicate(bb, &xx1));
1467     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_BACKWARD_SWEEP), fshift, lits, 1, xx));
1468 
1469     PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1470     PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1471     if (!mat->diag) {
1472       PetscCall(MatCreateVecs(matin, &mat->diag, NULL));
1473       PetscCall(MatGetDiagonal(matin, mat->diag));
1474     }
1475     PetscCall(MatHasOperation(matin, MATOP_MULT_DIAGONAL_BLOCK, &hasop));
1476     if (hasop) {
1477       PetscCall(MatMultDiagonalBlock(matin, xx, bb1));
1478     } else {
1479       PetscCall(VecPointwiseMult(bb1, mat->diag, xx));
1480     }
1481     PetscCall(VecAYPX(bb1, (omega - 2.0) / omega, bb));
1482 
1483     PetscCall(MatMultAdd(mat->B, mat->lvec, bb1, bb1));
1484 
1485     /* local sweep */
1486     PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_FORWARD_SWEEP), fshift, lits, 1, xx1));
1487     PetscCall(VecAXPY(xx, 1.0, xx1));
1488     PetscCall(VecDestroy(&xx1));
1489   } else SETERRQ(PetscObjectComm((PetscObject)matin), PETSC_ERR_SUP, "Parallel SOR not supported");
1490 
1491   PetscCall(VecDestroy(&bb1));
1492 
1493   matin->factorerrortype = mat->A->factorerrortype;
1494   PetscFunctionReturn(PETSC_SUCCESS);
1495 }
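
/*
   The local SOR sweeps above use the per-rank block form [ A | B ]: ghost entries of x are gathered
   into mat->lvec, the right-hand side is corrected as bb1 = bb - B*lvec, and the sequential SOR
   kernel is applied to the diagonal block A only (a processor-local relaxation).

   Minimal usage sketch, assuming a KSP ksp, parallel Mat A, and Vecs b, x created elsewhere
   (illustrative only, not a definitive recipe):

     PC pc;
     PetscCall(KSPSetOperators(ksp, A, A));
     PetscCall(KSPGetPC(ksp, &pc));
     PetscCall(PCSetType(pc, PCSOR));                              // PCSOR calls MatSOR() internally
     PetscCall(PCSORSetSymmetric(pc, SOR_LOCAL_SYMMETRIC_SWEEP));  // the local symmetric branch handled above
     PetscCall(KSPSolve(ksp, b, x));
*/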
1496 
1497 static PetscErrorCode MatPermute_MPIAIJ(Mat A, IS rowp, IS colp, Mat *B)
1498 {
1499   Mat             aA, aB, Aperm;
1500   const PetscInt *rwant, *cwant, *gcols, *ai, *bi, *aj, *bj;
1501   PetscScalar    *aa, *ba;
1502   PetscInt        i, j, m, n, ng, anz, bnz, *dnnz, *onnz, *tdnnz, *tonnz, *rdest, *cdest, *work, *gcdest;
1503   PetscSF         rowsf, sf;
1504   IS              parcolp = NULL;
1505   PetscBool       done;
1506 
1507   PetscFunctionBegin;
1508   PetscCall(MatGetLocalSize(A, &m, &n));
1509   PetscCall(ISGetIndices(rowp, &rwant));
1510   PetscCall(ISGetIndices(colp, &cwant));
1511   PetscCall(PetscMalloc3(PetscMax(m, n), &work, m, &rdest, n, &cdest));
1512 
1513   /* Invert row permutation to find out where my rows should go */
1514   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &rowsf));
1515   PetscCall(PetscSFSetGraphLayout(rowsf, A->rmap, A->rmap->n, NULL, PETSC_OWN_POINTER, rwant));
1516   PetscCall(PetscSFSetFromOptions(rowsf));
1517   for (i = 0; i < m; i++) work[i] = A->rmap->rstart + i;
1518   PetscCall(PetscSFReduceBegin(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1519   PetscCall(PetscSFReduceEnd(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1520 
1521   /* Invert column permutation to find out where my columns should go */
1522   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1523   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, A->cmap->n, NULL, PETSC_OWN_POINTER, cwant));
1524   PetscCall(PetscSFSetFromOptions(sf));
1525   for (i = 0; i < n; i++) work[i] = A->cmap->rstart + i;
1526   PetscCall(PetscSFReduceBegin(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1527   PetscCall(PetscSFReduceEnd(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1528   PetscCall(PetscSFDestroy(&sf));
1529 
1530   PetscCall(ISRestoreIndices(rowp, &rwant));
1531   PetscCall(ISRestoreIndices(colp, &cwant));
1532   PetscCall(MatMPIAIJGetSeqAIJ(A, &aA, &aB, &gcols));
1533 
1534   /* Find out where my gcols should go */
1535   PetscCall(MatGetSize(aB, NULL, &ng));
1536   PetscCall(PetscMalloc1(ng, &gcdest));
1537   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1538   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, ng, NULL, PETSC_OWN_POINTER, gcols));
1539   PetscCall(PetscSFSetFromOptions(sf));
1540   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1541   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1542   PetscCall(PetscSFDestroy(&sf));
1543 
1544   PetscCall(PetscCalloc4(m, &dnnz, m, &onnz, m, &tdnnz, m, &tonnz));
1545   PetscCall(MatGetRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1546   PetscCall(MatGetRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1547   for (i = 0; i < m; i++) {
1548     PetscInt    row = rdest[i];
1549     PetscMPIInt rowner;
1550     PetscCall(PetscLayoutFindOwner(A->rmap, row, &rowner));
1551     for (j = ai[i]; j < ai[i + 1]; j++) {
1552       PetscInt    col = cdest[aj[j]];
1553       PetscMPIInt cowner;
1554       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner)); /* Could build an index for the columns to eliminate this search */
1555       if (rowner == cowner) dnnz[i]++;
1556       else onnz[i]++;
1557     }
1558     for (j = bi[i]; j < bi[i + 1]; j++) {
1559       PetscInt    col = gcdest[bj[j]];
1560       PetscMPIInt cowner;
1561       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner));
1562       if (rowner == cowner) dnnz[i]++;
1563       else onnz[i]++;
1564     }
1565   }
1566   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1567   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1568   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1569   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1570   PetscCall(PetscSFDestroy(&rowsf));
1571 
1572   PetscCall(MatCreateAIJ(PetscObjectComm((PetscObject)A), A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N, 0, tdnnz, 0, tonnz, &Aperm));
1573   PetscCall(MatSeqAIJGetArray(aA, &aa));
1574   PetscCall(MatSeqAIJGetArray(aB, &ba));
1575   for (i = 0; i < m; i++) {
1576     PetscInt *acols = dnnz, *bcols = onnz; /* Repurpose now-unneeded arrays */
1577     PetscInt  j0, rowlen;
1578     rowlen = ai[i + 1] - ai[i];
1579     for (j0 = j = 0; j < rowlen; j0 = j) { /* rowlen could be larger than the number of local rows m (the size of the scratch arrays), so insert in batches */
1580       for (; j < PetscMin(rowlen, j0 + m); j++) acols[j - j0] = cdest[aj[ai[i] + j]];
1581       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, acols, aa + ai[i] + j0, INSERT_VALUES));
1582     }
1583     rowlen = bi[i + 1] - bi[i];
1584     for (j0 = j = 0; j < rowlen; j0 = j) {
1585       for (; j < PetscMin(rowlen, j0 + m); j++) bcols[j - j0] = gcdest[bj[bi[i] + j]];
1586       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, bcols, ba + bi[i] + j0, INSERT_VALUES));
1587     }
1588   }
1589   PetscCall(MatAssemblyBegin(Aperm, MAT_FINAL_ASSEMBLY));
1590   PetscCall(MatAssemblyEnd(Aperm, MAT_FINAL_ASSEMBLY));
1591   PetscCall(MatRestoreRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1592   PetscCall(MatRestoreRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1593   PetscCall(MatSeqAIJRestoreArray(aA, &aa));
1594   PetscCall(MatSeqAIJRestoreArray(aB, &ba));
1595   PetscCall(PetscFree4(dnnz, onnz, tdnnz, tonnz));
1596   PetscCall(PetscFree3(work, rdest, cdest));
1597   PetscCall(PetscFree(gcdest));
1598   if (parcolp) PetscCall(ISDestroy(&colp));
1599   *B = Aperm;
1600   PetscFunctionReturn(PETSC_SUCCESS);
1601 }
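
/*
   The permutation inversion above follows a reusable PetscSF pattern: each local index is a leaf
   that points at the root named by the permutation, and reducing "my current global index" onto the
   roots tells every owned index where it is sent. A stripped-down sketch of that pattern
   (illustrative; comm, layout, nlocal, rstart and the arrays perm, src, dest are assumed to be set
   up elsewhere):

     PetscSF sf;
     PetscCall(PetscSFCreate(comm, &sf));
     PetscCall(PetscSFSetGraphLayout(sf, layout, nlocal, NULL, PETSC_OWN_POINTER, perm)); // perm[i] = global index that should land at my i-th position
     for (PetscInt i = 0; i < nlocal; i++) src[i] = rstart + i;                           // my current global indices
     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, src, dest, MPI_REPLACE));
     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, src, dest, MPI_REPLACE));                   // dest[j] (on the owner of global index j) = where j is sent, i.e. the inverse of perm
     PetscCall(PetscSFDestroy(&sf));
*/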
1602 
1603 static PetscErrorCode MatGetGhosts_MPIAIJ(Mat mat, PetscInt *nghosts, const PetscInt *ghosts[])
1604 {
1605   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1606 
1607   PetscFunctionBegin;
1608   PetscCall(MatGetSize(aij->B, NULL, nghosts));
1609   if (ghosts) *ghosts = aij->garray;
1610   PetscFunctionReturn(PETSC_SUCCESS);
1611 }
1612 
1613 static PetscErrorCode MatGetInfo_MPIAIJ(Mat matin, MatInfoType flag, MatInfo *info)
1614 {
1615   Mat_MPIAIJ    *mat = (Mat_MPIAIJ *)matin->data;
1616   Mat            A = mat->A, B = mat->B;
1617   PetscLogDouble isend[5], irecv[5];
1618 
1619   PetscFunctionBegin;
1620   info->block_size = 1.0;
1621   PetscCall(MatGetInfo(A, MAT_LOCAL, info));
1622 
1623   isend[0] = info->nz_used;
1624   isend[1] = info->nz_allocated;
1625   isend[2] = info->nz_unneeded;
1626   isend[3] = info->memory;
1627   isend[4] = info->mallocs;
1628 
1629   PetscCall(MatGetInfo(B, MAT_LOCAL, info));
1630 
1631   isend[0] += info->nz_used;
1632   isend[1] += info->nz_allocated;
1633   isend[2] += info->nz_unneeded;
1634   isend[3] += info->memory;
1635   isend[4] += info->mallocs;
1636   if (flag == MAT_LOCAL) {
1637     info->nz_used      = isend[0];
1638     info->nz_allocated = isend[1];
1639     info->nz_unneeded  = isend[2];
1640     info->memory       = isend[3];
1641     info->mallocs      = isend[4];
1642   } else if (flag == MAT_GLOBAL_MAX) {
1643     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_MAX, PetscObjectComm((PetscObject)matin)));
1644 
1645     info->nz_used      = irecv[0];
1646     info->nz_allocated = irecv[1];
1647     info->nz_unneeded  = irecv[2];
1648     info->memory       = irecv[3];
1649     info->mallocs      = irecv[4];
1650   } else if (flag == MAT_GLOBAL_SUM) {
1651     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_SUM, PetscObjectComm((PetscObject)matin)));
1652 
1653     info->nz_used      = irecv[0];
1654     info->nz_allocated = irecv[1];
1655     info->nz_unneeded  = irecv[2];
1656     info->memory       = irecv[3];
1657     info->mallocs      = irecv[4];
1658   }
1659   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1660   info->fill_ratio_needed = 0;
1661   info->factor_mallocs    = 0;
1662   PetscFunctionReturn(PETSC_SUCCESS);
1663 }
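
/*
   Usage sketch for the routine above via the public MatGetInfo() interface (illustrative, assuming
   an assembled parallel Mat A exists); MAT_LOCAL, MAT_GLOBAL_MAX, and MAT_GLOBAL_SUM select the
   reductions implemented above:

     MatInfo info;
     PetscCall(MatGetInfo(A, MAT_GLOBAL_SUM, &info));
     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "nonzeros used: %g  allocated: %g\n", (double)info.nz_used, (double)info.nz_allocated));
*/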
1664 
1665 PetscErrorCode MatSetOption_MPIAIJ(Mat A, MatOption op, PetscBool flg)
1666 {
1667   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1668 
1669   PetscFunctionBegin;
1670   switch (op) {
1671   case MAT_NEW_NONZERO_LOCATIONS:
1672   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1673   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1674   case MAT_KEEP_NONZERO_PATTERN:
1675   case MAT_NEW_NONZERO_LOCATION_ERR:
1676   case MAT_USE_INODES:
1677   case MAT_IGNORE_ZERO_ENTRIES:
1678   case MAT_FORM_EXPLICIT_TRANSPOSE:
1679     MatCheckPreallocated(A, 1);
1680     PetscCall(MatSetOption(a->A, op, flg));
1681     PetscCall(MatSetOption(a->B, op, flg));
1682     break;
1683   case MAT_ROW_ORIENTED:
1684     MatCheckPreallocated(A, 1);
1685     a->roworiented = flg;
1686 
1687     PetscCall(MatSetOption(a->A, op, flg));
1688     PetscCall(MatSetOption(a->B, op, flg));
1689     break;
1690   case MAT_IGNORE_OFF_PROC_ENTRIES:
1691     a->donotstash = flg;
1692     break;
1693   /* Symmetry flags are handled directly by MatSetOption() and they don't affect preallocation */
1694   case MAT_SPD:
1695   case MAT_SYMMETRIC:
1696   case MAT_STRUCTURALLY_SYMMETRIC:
1697   case MAT_HERMITIAN:
1698   case MAT_SYMMETRY_ETERNAL:
1699   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
1700   case MAT_SPD_ETERNAL:
1701     /* if the diagonal matrix is square it inherits some of the properties above */
1702     if (a->A && A->rmap->n == A->cmap->n) PetscCall(MatSetOption(a->A, op, flg));
1703     break;
1704   case MAT_SUBMAT_SINGLEIS:
1705     A->submat_singleis = flg;
1706     break;
1707   default:
1708     break;
1709   }
1710   PetscFunctionReturn(PETSC_SUCCESS);
1711 }
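
/*
   Usage sketch (illustrative): the options handled above are set through the public MatSetOption(),
   for example on an existing MATMPIAIJ matrix A:

     PetscCall(MatSetOption(A, MAT_IGNORE_OFF_PROC_ENTRIES, PETSC_TRUE)); // do not stash entries destined for other ranks
     PetscCall(MatSetOption(A, MAT_SYMMETRIC, PETSC_TRUE));               // symmetry flag, forwarded to the diagonal block when square
*/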
1712 
1713 PetscErrorCode MatGetRow_MPIAIJ(Mat matin, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1714 {
1715   Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)matin->data;
1716   PetscScalar *vworkA, *vworkB, **pvA, **pvB, *v_p;
1717   PetscInt     i, *cworkA, *cworkB, **pcA, **pcB, cstart = matin->cmap->rstart;
1718   PetscInt     nztot, nzA, nzB, lrow, rstart = matin->rmap->rstart, rend = matin->rmap->rend;
1719   PetscInt    *cmap, *idx_p;
1720 
1721   PetscFunctionBegin;
1722   PetscCheck(!mat->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Already active");
1723   mat->getrowactive = PETSC_TRUE;
1724 
1725   if (!mat->rowvalues && (idx || v)) {
1726     /*
1727         allocate enough space to hold information from the longest row.
1728     */
1729     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *)mat->A->data, *Ba = (Mat_SeqAIJ *)mat->B->data;
1730     PetscInt    max = 1, tmp;
1731     for (i = 0; i < matin->rmap->n; i++) {
1732       tmp = Aa->i[i + 1] - Aa->i[i] + Ba->i[i + 1] - Ba->i[i];
1733       if (max < tmp) max = tmp;
1734     }
1735     PetscCall(PetscMalloc2(max, &mat->rowvalues, max, &mat->rowindices));
1736   }
1737 
1738   PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Only local rows");
1739   lrow = row - rstart;
1740 
1741   pvA = &vworkA;
1742   pcA = &cworkA;
1743   pvB = &vworkB;
1744   pcB = &cworkB;
1745   if (!v) {
1746     pvA = NULL;
1747     pvB = NULL;
1748   }
1749   if (!idx) {
1750     pcA = NULL;
1751     if (!v) pcB = NULL;
1752   }
1753   PetscCall((*mat->A->ops->getrow)(mat->A, lrow, &nzA, pcA, pvA));
1754   PetscCall((*mat->B->ops->getrow)(mat->B, lrow, &nzB, pcB, pvB));
1755   nztot = nzA + nzB;
1756 
1757   cmap = mat->garray;
1758   if (v || idx) {
1759     if (nztot) {
1760       /* Sort by increasing column numbers, assuming A and B already sorted */
1761       PetscInt imark = -1;
1762       if (v) {
1763         *v = v_p = mat->rowvalues;
1764         for (i = 0; i < nzB; i++) {
1765           if (cmap[cworkB[i]] < cstart) v_p[i] = vworkB[i];
1766           else break;
1767         }
1768         imark = i;
1769         for (i = 0; i < nzA; i++) v_p[imark + i] = vworkA[i];
1770         for (i = imark; i < nzB; i++) v_p[nzA + i] = vworkB[i];
1771       }
1772       if (idx) {
1773         *idx = idx_p = mat->rowindices;
1774         if (imark > -1) {
1775           for (i = 0; i < imark; i++) idx_p[i] = cmap[cworkB[i]];
1776         } else {
1777           for (i = 0; i < nzB; i++) {
1778             if (cmap[cworkB[i]] < cstart) idx_p[i] = cmap[cworkB[i]];
1779             else break;
1780           }
1781           imark = i;
1782         }
1783         for (i = 0; i < nzA; i++) idx_p[imark + i] = cstart + cworkA[i];
1784         for (i = imark; i < nzB; i++) idx_p[nzA + i] = cmap[cworkB[i]];
1785       }
1786     } else {
1787       if (idx) *idx = NULL;
1788       if (v) *v = NULL;
1789     }
1790   }
1791   *nz = nztot;
1792   PetscCall((*mat->A->ops->restorerow)(mat->A, lrow, &nzA, pcA, pvA));
1793   PetscCall((*mat->B->ops->restorerow)(mat->B, lrow, &nzB, pcB, pvB));
1794   PetscFunctionReturn(PETSC_SUCCESS);
1795 }
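
/*
   Usage sketch for the merged row access implemented above, through the public MatGetRow()
   interface (only rows owned by the calling rank may be requested); illustrative, assuming an
   assembled parallel Mat A exists:

     PetscInt           row, rstart, rend, ncols;
     const PetscInt    *cols;
     const PetscScalar *vals;
     PetscCall(MatGetOwnershipRange(A, &rstart, &rend));
     for (row = rstart; row < rend; row++) {
       PetscCall(MatGetRow(A, row, &ncols, &cols, &vals));      // global column indices, sorted
       // ... inspect ncols, cols, vals ...
       PetscCall(MatRestoreRow(A, row, &ncols, &cols, &vals));  // must be restored before the next MatGetRow()
     }
*/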
1796 
1797 PetscErrorCode MatRestoreRow_MPIAIJ(Mat mat, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1798 {
1799   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1800 
1801   PetscFunctionBegin;
1802   PetscCheck(aij->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "MatGetRow() must be called first");
1803   aij->getrowactive = PETSC_FALSE;
1804   PetscFunctionReturn(PETSC_SUCCESS);
1805 }
1806 
1807 static PetscErrorCode MatNorm_MPIAIJ(Mat mat, NormType type, PetscReal *norm)
1808 {
1809   Mat_MPIAIJ      *aij  = (Mat_MPIAIJ *)mat->data;
1810   Mat_SeqAIJ      *amat = (Mat_SeqAIJ *)aij->A->data, *bmat = (Mat_SeqAIJ *)aij->B->data;
1811   PetscInt         i, j, cstart = mat->cmap->rstart;
1812   PetscReal        sum = 0.0;
1813   const MatScalar *v, *amata, *bmata;
1814 
1815   PetscFunctionBegin;
1816   if (aij->size == 1) {
1817     PetscCall(MatNorm(aij->A, type, norm));
1818   } else {
1819     PetscCall(MatSeqAIJGetArrayRead(aij->A, &amata));
1820     PetscCall(MatSeqAIJGetArrayRead(aij->B, &bmata));
1821     if (type == NORM_FROBENIUS) {
1822       v = amata;
1823       for (i = 0; i < amat->nz; i++) {
1824         sum += PetscRealPart(PetscConj(*v) * (*v));
1825         v++;
1826       }
1827       v = bmata;
1828       for (i = 0; i < bmat->nz; i++) {
1829         sum += PetscRealPart(PetscConj(*v) * (*v));
1830         v++;
1831       }
1832       PetscCallMPI(MPIU_Allreduce(&sum, norm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1833       *norm = PetscSqrtReal(*norm);
1834       PetscCall(PetscLogFlops(2.0 * amat->nz + 2.0 * bmat->nz));
1835     } else if (type == NORM_1) { /* max column norm */
1836       PetscReal *tmp;
1837       PetscInt  *jj, *garray = aij->garray;
1838       PetscCall(PetscCalloc1(mat->cmap->N + 1, &tmp));
1839       *norm = 0.0;
1840       v     = amata;
1841       jj    = amat->j;
1842       for (j = 0; j < amat->nz; j++) {
1843         tmp[cstart + *jj++] += PetscAbsScalar(*v);
1844         v++;
1845       }
1846       v  = bmata;
1847       jj = bmat->j;
1848       for (j = 0; j < bmat->nz; j++) {
1849         tmp[garray[*jj++]] += PetscAbsScalar(*v);
1850         v++;
1851       }
1852       PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, tmp, mat->cmap->N, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1853       for (j = 0; j < mat->cmap->N; j++) {
1854         if (tmp[j] > *norm) *norm = tmp[j];
1855       }
1856       PetscCall(PetscFree(tmp));
1857       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1858     } else if (type == NORM_INFINITY) { /* max row norm */
1859       PetscReal ntemp = 0.0;
1860       for (j = 0; j < aij->A->rmap->n; j++) {
1861         v   = PetscSafePointerPlusOffset(amata, amat->i[j]);
1862         sum = 0.0;
1863         for (i = 0; i < amat->i[j + 1] - amat->i[j]; i++) {
1864           sum += PetscAbsScalar(*v);
1865           v++;
1866         }
1867         v = PetscSafePointerPlusOffset(bmata, bmat->i[j]);
1868         for (i = 0; i < bmat->i[j + 1] - bmat->i[j]; i++) {
1869           sum += PetscAbsScalar(*v);
1870           v++;
1871         }
1872         if (sum > ntemp) ntemp = sum;
1873       }
1874       PetscCallMPI(MPIU_Allreduce(&ntemp, norm, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)mat)));
1875       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1876     } else SETERRQ(PetscObjectComm((PetscObject)mat), PETSC_ERR_SUP, "No support for two norm");
1877     PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &amata));
1878     PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &bmata));
1879   }
1880   PetscFunctionReturn(PETSC_SUCCESS);
1881 }
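
/*
   For reference, the norms computed above are
     NORM_FROBENIUS:  ||A||_F   = sqrt(sum_ij |a_ij|^2)   local sums over the A and B blocks, then a global sum
     NORM_1:          ||A||_1   = max_j sum_i |a_ij|      column sums accumulated into a length-N work array
     NORM_INFINITY:   ||A||_oo  = max_i sum_j |a_ij|      per-row sums, then a global max
*/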
1882 
1883 static PetscErrorCode MatTranspose_MPIAIJ(Mat A, MatReuse reuse, Mat *matout)
1884 {
1885   Mat_MPIAIJ      *a    = (Mat_MPIAIJ *)A->data, *b;
1886   Mat_SeqAIJ      *Aloc = (Mat_SeqAIJ *)a->A->data, *Bloc = (Mat_SeqAIJ *)a->B->data, *sub_B_diag;
1887   PetscInt         M = A->rmap->N, N = A->cmap->N, ma, na, mb, nb, row, *cols, *cols_tmp, *B_diag_ilen, i, ncol, A_diag_ncol;
1888   const PetscInt  *ai, *aj, *bi, *bj, *B_diag_i;
1889   Mat              B, A_diag, *B_diag;
1890   const MatScalar *pbv, *bv;
1891 
1892   PetscFunctionBegin;
1893   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *matout));
1894   ma = A->rmap->n;
1895   na = A->cmap->n;
1896   mb = a->B->rmap->n;
1897   nb = a->B->cmap->n;
1898   ai = Aloc->i;
1899   aj = Aloc->j;
1900   bi = Bloc->i;
1901   bj = Bloc->j;
1902   if (reuse == MAT_INITIAL_MATRIX || *matout == A) {
1903     PetscInt            *d_nnz, *g_nnz, *o_nnz;
1904     PetscSFNode         *oloc;
1905     PETSC_UNUSED PetscSF sf;
1906 
1907     PetscCall(PetscMalloc4(na, &d_nnz, na, &o_nnz, nb, &g_nnz, nb, &oloc));
1908     /* compute d_nnz for preallocation */
1909     PetscCall(PetscArrayzero(d_nnz, na));
1910     for (i = 0; i < ai[ma]; i++) d_nnz[aj[i]]++;
1911     /* compute local off-diagonal contributions */
1912     PetscCall(PetscArrayzero(g_nnz, nb));
1913     for (i = 0; i < bi[ma]; i++) g_nnz[bj[i]]++;
1914     /* map those to global */
1915     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1916     PetscCall(PetscSFSetGraphLayout(sf, A->cmap, nb, NULL, PETSC_USE_POINTER, a->garray));
1917     PetscCall(PetscSFSetFromOptions(sf));
1918     PetscCall(PetscArrayzero(o_nnz, na));
1919     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1920     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1921     PetscCall(PetscSFDestroy(&sf));
1922 
1923     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
1924     PetscCall(MatSetSizes(B, A->cmap->n, A->rmap->n, N, M));
1925     PetscCall(MatSetBlockSizes(B, A->cmap->bs, A->rmap->bs));
1926     PetscCall(MatSetType(B, ((PetscObject)A)->type_name));
1927     PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
1928     PetscCall(PetscFree4(d_nnz, o_nnz, g_nnz, oloc));
1929   } else {
1930     B = *matout;
1931     PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
1932   }
1933 
1934   b           = (Mat_MPIAIJ *)B->data;
1935   A_diag      = a->A;
1936   B_diag      = &b->A;
1937   sub_B_diag  = (Mat_SeqAIJ *)(*B_diag)->data;
1938   A_diag_ncol = A_diag->cmap->N;
1939   B_diag_ilen = sub_B_diag->ilen;
1940   B_diag_i    = sub_B_diag->i;
1941 
1942   /* Set ilen for diagonal of B */
1943   for (i = 0; i < A_diag_ncol; i++) B_diag_ilen[i] = B_diag_i[i + 1] - B_diag_i[i];
1944 
1945   /* Transpose the diagonal part of the matrix. In contrast to the off-diagonal part, this can be done
1946      very quickly (i.e. without using MatSetValues()), because all writes are local. */
1947   PetscCall(MatTransposeSetPrecursor(A_diag, *B_diag));
1948   PetscCall(MatTranspose(A_diag, MAT_REUSE_MATRIX, B_diag));
1949 
1950   /* copy over the B part */
1951   PetscCall(PetscMalloc1(bi[mb], &cols));
1952   PetscCall(MatSeqAIJGetArrayRead(a->B, &bv));
1953   pbv = bv;
1954   row = A->rmap->rstart;
1955   for (i = 0; i < bi[mb]; i++) cols[i] = a->garray[bj[i]];
1956   cols_tmp = cols;
1957   for (i = 0; i < mb; i++) {
1958     ncol = bi[i + 1] - bi[i];
1959     PetscCall(MatSetValues(B, ncol, cols_tmp, 1, &row, pbv, INSERT_VALUES));
1960     row++;
1961     if (pbv) pbv += ncol;
1962     if (cols_tmp) cols_tmp += ncol;
1963   }
1964   PetscCall(PetscFree(cols));
1965   PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bv));
1966 
1967   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1968   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1969   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1970     *matout = B;
1971   } else {
1972     PetscCall(MatHeaderMerge(A, &B));
1973   }
1974   PetscFunctionReturn(PETSC_SUCCESS);
1975 }
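
/*
   Usage sketch (illustrative) for the routine above through the public MatTranspose():

     Mat At;
     PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));  // allocate and fill A^T
     // ... change the values (but not the nonzero pattern) of A ...
     PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &At));    // refill the existing A^T
     PetscCall(MatDestroy(&At));

   In-place transposition, MatTranspose(A, MAT_INPLACE_MATRIX, &A), ends in the MatHeaderMerge() branch above.
*/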
1976 
1977 static PetscErrorCode MatDiagonalScale_MPIAIJ(Mat mat, Vec ll, Vec rr)
1978 {
1979   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1980   Mat         a = aij->A, b = aij->B;
1981   PetscInt    s1, s2, s3;
1982 
1983   PetscFunctionBegin;
1984   PetscCall(MatGetLocalSize(mat, &s2, &s3));
1985   if (rr) {
1986     PetscCall(VecGetLocalSize(rr, &s1));
1987     PetscCheck(s1 == s3, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "right vector non-conforming local size");
1988     /* Overlap communication with computation. */
1989     PetscCall(VecScatterBegin(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
1990   }
1991   if (ll) {
1992     PetscCall(VecGetLocalSize(ll, &s1));
1993     PetscCheck(s1 == s2, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "left vector non-conforming local size");
1994     PetscUseTypeMethod(b, diagonalscale, ll, NULL);
1995   }
1996   /* scale  the diagonal block */
1997   PetscUseTypeMethod(a, diagonalscale, ll, rr);
1998 
1999   if (rr) {
2000     /* Do a scatter end and then right scale the off-diagonal block */
2001     PetscCall(VecScatterEnd(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
2002     PetscUseTypeMethod(b, diagonalscale, NULL, aij->lvec);
2003   }
2004   PetscFunctionReturn(PETSC_SUCCESS);
2005 }
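
/*
   The routine above computes A <- diag(ll) * A * diag(rr), overlapping the scatter of rr's ghost
   values with the scaling of the diagonal block. Usage sketch (illustrative), assuming vectors l
   (row layout) and r (column layout) conforming to A exist:

     PetscCall(MatDiagonalScale(A, l, r));   // either argument may be NULL to skip that side
*/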
2006 
2007 static PetscErrorCode MatSetUnfactored_MPIAIJ(Mat A)
2008 {
2009   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2010 
2011   PetscFunctionBegin;
2012   PetscCall(MatSetUnfactored(a->A));
2013   PetscFunctionReturn(PETSC_SUCCESS);
2014 }
2015 
2016 static PetscErrorCode MatEqual_MPIAIJ(Mat A, Mat B, PetscBool *flag)
2017 {
2018   Mat_MPIAIJ *matB = (Mat_MPIAIJ *)B->data, *matA = (Mat_MPIAIJ *)A->data;
2019   Mat         a, b, c, d;
2020   PetscBool   flg;
2021 
2022   PetscFunctionBegin;
2023   a = matA->A;
2024   b = matA->B;
2025   c = matB->A;
2026   d = matB->B;
2027 
2028   PetscCall(MatEqual(a, c, &flg));
2029   if (flg) PetscCall(MatEqual(b, d, &flg));
2030   PetscCallMPI(MPIU_Allreduce(&flg, flag, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
2031   PetscFunctionReturn(PETSC_SUCCESS);
2032 }
2033 
2034 static PetscErrorCode MatCopy_MPIAIJ(Mat A, Mat B, MatStructure str)
2035 {
2036   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2037   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2038 
2039   PetscFunctionBegin;
2040   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
2041   if ((str != SAME_NONZERO_PATTERN) || (A->ops->copy != B->ops->copy)) {
2042     /* because of the column compression in the off-processor part of the matrix a->B,
2043        the number of columns in a->B and b->B may be different, hence we cannot call
2044        the MatCopy() directly on the two parts. If need be, we can provide a more
2045        efficient copy than the MatCopy_Basic() by first uncompressing the a->B matrices
2046        then copying the submatrices */
2047     PetscCall(MatCopy_Basic(A, B, str));
2048   } else {
2049     PetscCall(MatCopy(a->A, b->A, str));
2050     PetscCall(MatCopy(a->B, b->B, str));
2051   }
2052   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2053   PetscFunctionReturn(PETSC_SUCCESS);
2054 }
2055 
2056 /*
2057    Computes the number of nonzeros per row needed for preallocation when X and Y
2058    have different nonzero structure.
2059 */
2060 PetscErrorCode MatAXPYGetPreallocation_MPIX_private(PetscInt m, const PetscInt *xi, const PetscInt *xj, const PetscInt *xltog, const PetscInt *yi, const PetscInt *yj, const PetscInt *yltog, PetscInt *nnz)
2061 {
2062   PetscInt i, j, k, nzx, nzy;
2063 
2064   PetscFunctionBegin;
2065   /* Set the number of nonzeros in the new matrix */
2066   for (i = 0; i < m; i++) {
2067     const PetscInt *xjj = PetscSafePointerPlusOffset(xj, xi[i]), *yjj = PetscSafePointerPlusOffset(yj, yi[i]);
2068     nzx    = xi[i + 1] - xi[i];
2069     nzy    = yi[i + 1] - yi[i];
2070     nnz[i] = 0;
2071     for (j = 0, k = 0; j < nzx; j++) {                                /* Point in X */
2072       for (; k < nzy && yltog[yjj[k]] < xltog[xjj[j]]; k++) nnz[i]++; /* Catch up to X */
2073       if (k < nzy && yltog[yjj[k]] == xltog[xjj[j]]) k++;             /* Skip duplicate */
2074       nnz[i]++;
2075     }
2076     for (; k < nzy; k++) nnz[i]++;
2077   }
2078   PetscFunctionReturn(PETSC_SUCCESS);
2079 }
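
/*
   Worked example for the row-wise merge count above (values are illustrative): with mapped column
   indices X row = {0, 3, 5} and Y row = {1, 3, 7}, the merged pattern is {0, 1, 3, 5, 7}, so nnz
   for that row is 5; the shared column 3 is counted once via the "Skip duplicate" branch.
*/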
2080 
2081 /* This is the same as MatAXPYGetPreallocation_SeqAIJ, except that the local-to-global map is provided */
2082 static PetscErrorCode MatAXPYGetPreallocation_MPIAIJ(Mat Y, const PetscInt *yltog, Mat X, const PetscInt *xltog, PetscInt *nnz)
2083 {
2084   PetscInt    m = Y->rmap->N;
2085   Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data;
2086   Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
2087 
2088   PetscFunctionBegin;
2089   PetscCall(MatAXPYGetPreallocation_MPIX_private(m, x->i, x->j, xltog, y->i, y->j, yltog, nnz));
2090   PetscFunctionReturn(PETSC_SUCCESS);
2091 }
2092 
2093 static PetscErrorCode MatAXPY_MPIAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2094 {
2095   Mat_MPIAIJ *xx = (Mat_MPIAIJ *)X->data, *yy = (Mat_MPIAIJ *)Y->data;
2096 
2097   PetscFunctionBegin;
2098   if (str == SAME_NONZERO_PATTERN) {
2099     PetscCall(MatAXPY(yy->A, a, xx->A, str));
2100     PetscCall(MatAXPY(yy->B, a, xx->B, str));
2101   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2102     PetscCall(MatAXPY_Basic(Y, a, X, str));
2103   } else {
2104     Mat       B;
2105     PetscInt *nnz_d, *nnz_o;
2106 
2107     PetscCall(PetscMalloc1(yy->A->rmap->N, &nnz_d));
2108     PetscCall(PetscMalloc1(yy->B->rmap->N, &nnz_o));
2109     PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
2110     PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
2111     PetscCall(MatSetLayouts(B, Y->rmap, Y->cmap));
2112     PetscCall(MatSetType(B, ((PetscObject)Y)->type_name));
2113     PetscCall(MatAXPYGetPreallocation_SeqAIJ(yy->A, xx->A, nnz_d));
2114     PetscCall(MatAXPYGetPreallocation_MPIAIJ(yy->B, yy->garray, xx->B, xx->garray, nnz_o));
2115     PetscCall(MatMPIAIJSetPreallocation(B, 0, nnz_d, 0, nnz_o));
2116     PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
2117     PetscCall(MatHeaderMerge(Y, &B));
2118     PetscCall(PetscFree(nnz_d));
2119     PetscCall(PetscFree(nnz_o));
2120   }
2121   PetscFunctionReturn(PETSC_SUCCESS);
2122 }
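
/*
   Usage sketch (illustrative) for the routine above through the public MatAXPY(), which computes
   Y <- a*X + Y:

     PetscCall(MatAXPY(Y, 2.0, X, DIFFERENT_NONZERO_PATTERN)); // general case: uses the preallocation path above
     PetscCall(MatAXPY(Y, -1.0, X, SAME_NONZERO_PATTERN));     // cheap case: both blocks are updated directly
*/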
2123 
2124 PETSC_INTERN PetscErrorCode MatConjugate_SeqAIJ(Mat);
2125 
2126 static PetscErrorCode MatConjugate_MPIAIJ(Mat mat)
2127 {
2128   PetscFunctionBegin;
2129   if (PetscDefined(USE_COMPLEX)) {
2130     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2131 
2132     PetscCall(MatConjugate_SeqAIJ(aij->A));
2133     PetscCall(MatConjugate_SeqAIJ(aij->B));
2134   }
2135   PetscFunctionReturn(PETSC_SUCCESS);
2136 }
2137 
2138 static PetscErrorCode MatRealPart_MPIAIJ(Mat A)
2139 {
2140   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2141 
2142   PetscFunctionBegin;
2143   PetscCall(MatRealPart(a->A));
2144   PetscCall(MatRealPart(a->B));
2145   PetscFunctionReturn(PETSC_SUCCESS);
2146 }
2147 
2148 static PetscErrorCode MatImaginaryPart_MPIAIJ(Mat A)
2149 {
2150   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2151 
2152   PetscFunctionBegin;
2153   PetscCall(MatImaginaryPart(a->A));
2154   PetscCall(MatImaginaryPart(a->B));
2155   PetscFunctionReturn(PETSC_SUCCESS);
2156 }
2157 
2158 static PetscErrorCode MatGetRowMaxAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2159 {
2160   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
2161   PetscInt           i, *idxb = NULL, m = A->rmap->n;
2162   PetscScalar       *vv;
2163   Vec                vB, vA;
2164   const PetscScalar *va, *vb;
2165 
2166   PetscFunctionBegin;
2167   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2168   PetscCall(MatGetRowMaxAbs(a->A, vA, idx));
2169 
2170   PetscCall(VecGetArrayRead(vA, &va));
2171   if (idx) {
2172     for (i = 0; i < m; i++) {
2173       if (PetscAbsScalar(va[i])) idx[i] += A->cmap->rstart;
2174     }
2175   }
2176 
2177   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2178   PetscCall(PetscMalloc1(m, &idxb));
2179   PetscCall(MatGetRowMaxAbs(a->B, vB, idxb));
2180 
2181   PetscCall(VecGetArrayWrite(v, &vv));
2182   PetscCall(VecGetArrayRead(vB, &vb));
2183   for (i = 0; i < m; i++) {
2184     if (PetscAbsScalar(va[i]) < PetscAbsScalar(vb[i])) {
2185       vv[i] = vb[i];
2186       if (idx) idx[i] = a->garray[idxb[i]];
2187     } else {
2188       vv[i] = va[i];
2189       if (idx && PetscAbsScalar(va[i]) == PetscAbsScalar(vb[i]) && idxb[i] != -1 && idx[i] > a->garray[idxb[i]]) idx[i] = a->garray[idxb[i]];
2190     }
2191   }
2192   PetscCall(VecRestoreArrayWrite(v, &vv));
2193   PetscCall(VecRestoreArrayRead(vA, &va));
2194   PetscCall(VecRestoreArrayRead(vB, &vb));
2195   PetscCall(PetscFree(idxb));
2196   PetscCall(VecDestroy(&vA));
2197   PetscCall(VecDestroy(&vB));
2198   PetscFunctionReturn(PETSC_SUCCESS);
2199 }
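
/*
   Usage sketch (illustrative) for the routine above through the public MatGetRowMaxAbs(); v must
   conform to the row layout of A and idx, if provided, receives the global column of each row's
   largest-magnitude entry:

     Vec       rowmax;
     PetscInt *loc, m;
     PetscCall(MatGetLocalSize(A, &m, NULL));
     PetscCall(MatCreateVecs(A, NULL, &rowmax));   // vector conforming to the rows of A
     PetscCall(PetscMalloc1(m, &loc));
     PetscCall(MatGetRowMaxAbs(A, rowmax, loc));
     // ... use rowmax and loc ...
     PetscCall(PetscFree(loc));
     PetscCall(VecDestroy(&rowmax));
*/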
2200 
2201 static PetscErrorCode MatGetRowSumAbs_MPIAIJ(Mat A, Vec v)
2202 {
2203   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2204   Vec         vB, vA;
2205 
2206   PetscFunctionBegin;
2207   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2208   PetscCall(MatGetRowSumAbs(a->A, vA));
2209   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2210   PetscCall(MatGetRowSumAbs(a->B, vB));
2211   PetscCall(VecAXPY(vA, 1.0, vB));
2212   PetscCall(VecDestroy(&vB));
2213   PetscCall(VecCopy(vA, v));
2214   PetscCall(VecDestroy(&vA));
2215   PetscFunctionReturn(PETSC_SUCCESS);
2216 }
2217 
2218 static PetscErrorCode MatGetRowMinAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2219 {
2220   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2221   PetscInt           m = A->rmap->n, n = A->cmap->n;
2222   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2223   PetscInt          *cmap = mat->garray;
2224   PetscInt          *diagIdx, *offdiagIdx;
2225   Vec                diagV, offdiagV;
2226   PetscScalar       *a, *diagA, *offdiagA;
2227   const PetscScalar *ba, *bav;
2228   PetscInt           r, j, col, ncols, *bi, *bj;
2229   Mat                B = mat->B;
2230   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2231 
2232   PetscFunctionBegin;
2233   /* When one process holds the entire matrix and the other processes have no entries */
2234   if (A->cmap->N == n) {
2235     PetscCall(VecGetArrayWrite(v, &diagA));
2236     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2237     PetscCall(MatGetRowMinAbs(mat->A, diagV, idx));
2238     PetscCall(VecDestroy(&diagV));
2239     PetscCall(VecRestoreArrayWrite(v, &diagA));
2240     PetscFunctionReturn(PETSC_SUCCESS);
2241   } else if (n == 0) {
2242     if (m) {
2243       PetscCall(VecGetArrayWrite(v, &a));
2244       for (r = 0; r < m; r++) {
2245         a[r] = 0.0;
2246         if (idx) idx[r] = -1;
2247       }
2248       PetscCall(VecRestoreArrayWrite(v, &a));
2249     }
2250     PetscFunctionReturn(PETSC_SUCCESS);
2251   }
2252 
2253   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2254   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2255   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2256   PetscCall(MatGetRowMinAbs(mat->A, diagV, diagIdx));
2257 
2258   /* Get offdiagIdx[] for implicit 0.0 */
2259   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2260   ba = bav;
2261   bi = b->i;
2262   bj = b->j;
2263   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2264   for (r = 0; r < m; r++) {
2265     ncols = bi[r + 1] - bi[r];
2266     if (ncols == A->cmap->N - n) { /* Brow is dense */
2267       offdiagA[r]   = *ba;
2268       offdiagIdx[r] = cmap[0];
2269     } else { /* Brow is sparse so there is an implicit 0.0, hence the minimum magnitude is at most 0.0 */
2270       offdiagA[r] = 0.0;
2271 
2272       /* Find first hole in the cmap */
2273       for (j = 0; j < ncols; j++) {
2274         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2275         if (col > j && j < cstart) {
2276           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2277           break;
2278         } else if (col > j + n && j >= cstart) {
2279           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2280           break;
2281         }
2282       }
2283       if (j == ncols && ncols < A->cmap->N - n) {
2284         /* a hole is outside compressed Bcols */
2285         if (ncols == 0) {
2286           if (cstart) {
2287             offdiagIdx[r] = 0;
2288           } else offdiagIdx[r] = cend;
2289         } else { /* ncols > 0 */
2290           offdiagIdx[r] = cmap[ncols - 1] + 1;
2291           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2292         }
2293       }
2294     }
2295 
2296     for (j = 0; j < ncols; j++) {
2297       if (PetscAbsScalar(offdiagA[r]) > PetscAbsScalar(*ba)) {
2298         offdiagA[r]   = *ba;
2299         offdiagIdx[r] = cmap[*bj];
2300       }
2301       ba++;
2302       bj++;
2303     }
2304   }
2305 
2306   PetscCall(VecGetArrayWrite(v, &a));
2307   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2308   for (r = 0; r < m; ++r) {
2309     if (PetscAbsScalar(diagA[r]) < PetscAbsScalar(offdiagA[r])) {
2310       a[r] = diagA[r];
2311       if (idx) idx[r] = cstart + diagIdx[r];
2312     } else if (PetscAbsScalar(diagA[r]) == PetscAbsScalar(offdiagA[r])) {
2313       a[r] = diagA[r];
2314       if (idx) {
2315         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2316           idx[r] = cstart + diagIdx[r];
2317         } else idx[r] = offdiagIdx[r];
2318       }
2319     } else {
2320       a[r] = offdiagA[r];
2321       if (idx) idx[r] = offdiagIdx[r];
2322     }
2323   }
2324   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2325   PetscCall(VecRestoreArrayWrite(v, &a));
2326   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2327   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2328   PetscCall(VecDestroy(&diagV));
2329   PetscCall(VecDestroy(&offdiagV));
2330   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2331   PetscFunctionReturn(PETSC_SUCCESS);
2332 }
2333 
2334 static PetscErrorCode MatGetRowMin_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2335 {
2336   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2337   PetscInt           m = A->rmap->n, n = A->cmap->n;
2338   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2339   PetscInt          *cmap = mat->garray;
2340   PetscInt          *diagIdx, *offdiagIdx;
2341   Vec                diagV, offdiagV;
2342   PetscScalar       *a, *diagA, *offdiagA;
2343   const PetscScalar *ba, *bav;
2344   PetscInt           r, j, col, ncols, *bi, *bj;
2345   Mat                B = mat->B;
2346   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2347 
2348   PetscFunctionBegin;
2349   /* When one process holds the entire matrix and the other processes have no entries */
2350   if (A->cmap->N == n) {
2351     PetscCall(VecGetArrayWrite(v, &diagA));
2352     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2353     PetscCall(MatGetRowMin(mat->A, diagV, idx));
2354     PetscCall(VecDestroy(&diagV));
2355     PetscCall(VecRestoreArrayWrite(v, &diagA));
2356     PetscFunctionReturn(PETSC_SUCCESS);
2357   } else if (n == 0) {
2358     if (m) {
2359       PetscCall(VecGetArrayWrite(v, &a));
2360       for (r = 0; r < m; r++) {
2361         a[r] = PETSC_MAX_REAL;
2362         if (idx) idx[r] = -1;
2363       }
2364       PetscCall(VecRestoreArrayWrite(v, &a));
2365     }
2366     PetscFunctionReturn(PETSC_SUCCESS);
2367   }
2368 
2369   PetscCall(PetscCalloc2(m, &diagIdx, m, &offdiagIdx));
2370   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2371   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2372   PetscCall(MatGetRowMin(mat->A, diagV, diagIdx));
2373 
2374   /* Get offdiagIdx[] for implicit 0.0 */
2375   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2376   ba = bav;
2377   bi = b->i;
2378   bj = b->j;
2379   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2380   for (r = 0; r < m; r++) {
2381     ncols = bi[r + 1] - bi[r];
2382     if (ncols == A->cmap->N - n) { /* Brow is dense */
2383       offdiagA[r]   = *ba;
2384       offdiagIdx[r] = cmap[0];
2385     } else { /* Brow is sparse so there is an implicit 0.0, hence the row minimum is at most 0.0 */
2386       offdiagA[r] = 0.0;
2387 
2388       /* Find first hole in the cmap */
2389       for (j = 0; j < ncols; j++) {
2390         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2391         if (col > j && j < cstart) {
2392           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2393           break;
2394         } else if (col > j + n && j >= cstart) {
2395           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2396           break;
2397         }
2398       }
2399       if (j == ncols && ncols < A->cmap->N - n) {
2400         /* a hole is outside compressed Bcols */
2401         if (ncols == 0) {
2402           if (cstart) {
2403             offdiagIdx[r] = 0;
2404           } else offdiagIdx[r] = cend;
2405         } else { /* ncols > 0 */
2406           offdiagIdx[r] = cmap[ncols - 1] + 1;
2407           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2408         }
2409       }
2410     }
2411 
2412     for (j = 0; j < ncols; j++) {
2413       if (PetscRealPart(offdiagA[r]) > PetscRealPart(*ba)) {
2414         offdiagA[r]   = *ba;
2415         offdiagIdx[r] = cmap[*bj];
2416       }
2417       ba++;
2418       bj++;
2419     }
2420   }
2421 
2422   PetscCall(VecGetArrayWrite(v, &a));
2423   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2424   for (r = 0; r < m; ++r) {
2425     if (PetscRealPart(diagA[r]) < PetscRealPart(offdiagA[r])) {
2426       a[r] = diagA[r];
2427       if (idx) idx[r] = cstart + diagIdx[r];
2428     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2429       a[r] = diagA[r];
2430       if (idx) {
2431         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2432           idx[r] = cstart + diagIdx[r];
2433         } else idx[r] = offdiagIdx[r];
2434       }
2435     } else {
2436       a[r] = offdiagA[r];
2437       if (idx) idx[r] = offdiagIdx[r];
2438     }
2439   }
2440   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2441   PetscCall(VecRestoreArrayWrite(v, &a));
2442   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2443   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2444   PetscCall(VecDestroy(&diagV));
2445   PetscCall(VecDestroy(&offdiagV));
2446   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2447   PetscFunctionReturn(PETSC_SUCCESS);
2448 }
2449 
2450 static PetscErrorCode MatGetRowMax_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2451 {
2452   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2453   PetscInt           m = A->rmap->n, n = A->cmap->n;
2454   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2455   PetscInt          *cmap = mat->garray;
2456   PetscInt          *diagIdx, *offdiagIdx;
2457   Vec                diagV, offdiagV;
2458   PetscScalar       *a, *diagA, *offdiagA;
2459   const PetscScalar *ba, *bav;
2460   PetscInt           r, j, col, ncols, *bi, *bj;
2461   Mat                B = mat->B;
2462   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2463 
2464   PetscFunctionBegin;
2465   /* When one process holds the entire matrix and the other processes have no entries */
2466   if (A->cmap->N == n) {
2467     PetscCall(VecGetArrayWrite(v, &diagA));
2468     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2469     PetscCall(MatGetRowMax(mat->A, diagV, idx));
2470     PetscCall(VecDestroy(&diagV));
2471     PetscCall(VecRestoreArrayWrite(v, &diagA));
2472     PetscFunctionReturn(PETSC_SUCCESS);
2473   } else if (n == 0) {
2474     if (m) {
2475       PetscCall(VecGetArrayWrite(v, &a));
2476       for (r = 0; r < m; r++) {
2477         a[r] = PETSC_MIN_REAL;
2478         if (idx) idx[r] = -1;
2479       }
2480       PetscCall(VecRestoreArrayWrite(v, &a));
2481     }
2482     PetscFunctionReturn(PETSC_SUCCESS);
2483   }
2484 
2485   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2486   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2487   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2488   PetscCall(MatGetRowMax(mat->A, diagV, diagIdx));
2489 
2490   /* Get offdiagIdx[] for implicit 0.0 */
2491   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2492   ba = bav;
2493   bi = b->i;
2494   bj = b->j;
2495   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2496   for (r = 0; r < m; r++) {
2497     ncols = bi[r + 1] - bi[r];
2498     if (ncols == A->cmap->N - n) { /* Brow is dense */
2499       offdiagA[r]   = *ba;
2500       offdiagIdx[r] = cmap[0];
2501     } else { /* Brow is sparse so there is an implicit 0.0, hence the row maximum is at least 0.0 */
2502       offdiagA[r] = 0.0;
2503 
2504       /* Find first hole in the cmap */
2505       for (j = 0; j < ncols; j++) {
2506         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2507         if (col > j && j < cstart) {
2508           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2509           break;
2510         } else if (col > j + n && j >= cstart) {
2511           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2512           break;
2513         }
2514       }
2515       if (j == ncols && ncols < A->cmap->N - n) {
2516         /* a hole is outside compressed Bcols */
2517         if (ncols == 0) {
2518           if (cstart) {
2519             offdiagIdx[r] = 0;
2520           } else offdiagIdx[r] = cend;
2521         } else { /* ncols > 0 */
2522           offdiagIdx[r] = cmap[ncols - 1] + 1;
2523           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2524         }
2525       }
2526     }
2527 
2528     for (j = 0; j < ncols; j++) {
2529       if (PetscRealPart(offdiagA[r]) < PetscRealPart(*ba)) {
2530         offdiagA[r]   = *ba;
2531         offdiagIdx[r] = cmap[*bj];
2532       }
2533       ba++;
2534       bj++;
2535     }
2536   }
2537 
2538   PetscCall(VecGetArrayWrite(v, &a));
2539   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2540   for (r = 0; r < m; ++r) {
2541     if (PetscRealPart(diagA[r]) > PetscRealPart(offdiagA[r])) {
2542       a[r] = diagA[r];
2543       if (idx) idx[r] = cstart + diagIdx[r];
2544     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2545       a[r] = diagA[r];
2546       if (idx) {
2547         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2548           idx[r] = cstart + diagIdx[r];
2549         } else idx[r] = offdiagIdx[r];
2550       }
2551     } else {
2552       a[r] = offdiagA[r];
2553       if (idx) idx[r] = offdiagIdx[r];
2554     }
2555   }
2556   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2557   PetscCall(VecRestoreArrayWrite(v, &a));
2558   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2559   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2560   PetscCall(VecDestroy(&diagV));
2561   PetscCall(VecDestroy(&offdiagV));
2562   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2563   PetscFunctionReturn(PETSC_SUCCESS);
2564 }
2565 
2566 PetscErrorCode MatGetSeqNonzeroStructure_MPIAIJ(Mat mat, Mat *newmat)
2567 {
2568   Mat *dummy;
2569 
2570   PetscFunctionBegin;
2571   PetscCall(MatCreateSubMatrix_MPIAIJ_All(mat, MAT_DO_NOT_GET_VALUES, MAT_INITIAL_MATRIX, &dummy));
2572   *newmat = *dummy;
2573   PetscCall(PetscFree(dummy));
2574   PetscFunctionReturn(PETSC_SUCCESS);
2575 }
2576 
2577 static PetscErrorCode MatInvertBlockDiagonal_MPIAIJ(Mat A, const PetscScalar **values)
2578 {
2579   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2580 
2581   PetscFunctionBegin;
2582   PetscCall(MatInvertBlockDiagonal(a->A, values));
2583   A->factorerrortype = a->A->factorerrortype;
2584   PetscFunctionReturn(PETSC_SUCCESS);
2585 }
2586 
2587 static PetscErrorCode MatSetRandom_MPIAIJ(Mat x, PetscRandom rctx)
2588 {
2589   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)x->data;
2590 
2591   PetscFunctionBegin;
2592   PetscCheck(x->assembled || x->preallocated, PetscObjectComm((PetscObject)x), PETSC_ERR_ARG_WRONGSTATE, "MatSetRandom on an unassembled and unpreallocated MATMPIAIJ is not allowed");
2593   PetscCall(MatSetRandom(aij->A, rctx));
2594   if (x->assembled) {
2595     PetscCall(MatSetRandom(aij->B, rctx));
2596   } else {
2597     PetscCall(MatSetRandomSkipColumnRange_SeqAIJ_Private(aij->B, x->cmap->rstart, x->cmap->rend, rctx));
2598   }
2599   PetscCall(MatAssemblyBegin(x, MAT_FINAL_ASSEMBLY));
2600   PetscCall(MatAssemblyEnd(x, MAT_FINAL_ASSEMBLY));
2601   PetscFunctionReturn(PETSC_SUCCESS);
2602 }
2603 
2604 static PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ(Mat A, PetscBool sc)
2605 {
2606   PetscFunctionBegin;
2607   if (sc) A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ_Scalable;
2608   else A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ;
2609   PetscFunctionReturn(PETSC_SUCCESS);
2610 }
2611 
2612 /*@
2613   MatMPIAIJGetNumberNonzeros - gets the number of nonzeros in the matrix on this MPI rank
2614 
2615   Not Collective
2616 
2617   Input Parameter:
2618 . A - the matrix
2619 
2620   Output Parameter:
2621 . nz - the number of nonzeros
2622 
2623   Level: advanced
2624 
2625 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2626 @*/
2627 PetscErrorCode MatMPIAIJGetNumberNonzeros(Mat A, PetscCount *nz)
2628 {
2629   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)A->data;
2630   Mat_SeqAIJ *aaij = (Mat_SeqAIJ *)maij->A->data, *baij = (Mat_SeqAIJ *)maij->B->data;
2631   PetscBool   isaij;
2632 
2633   PetscFunctionBegin;
2634   PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIAIJ, &isaij));
2635   PetscCheck(isaij, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
2636   *nz = aaij->i[A->rmap->n] + baij->i[A->rmap->n];
2637   PetscFunctionReturn(PETSC_SUCCESS);
2638 }
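/*
  A minimal usage sketch (not part of this implementation; the matrix A below is assumed to
  already exist and be assembled as a MATMPIAIJ):

    PetscCount nz;
    PetscCall(MatMPIAIJGetNumberNonzeros(A, &nz));
    PetscCall(PetscPrintf(PETSC_COMM_SELF, "local nonzeros: %" PetscInt64_FMT "\n", (PetscInt64)nz));

  The count is the sum of the nonzeros stored in the local "diagonal" and "off-diagonal" SeqAIJ
  blocks, as computed above from their row pointer arrays.
*/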
2639 
2640 /*@
2641   MatMPIAIJSetUseScalableIncreaseOverlap - Determine if the matrix uses a scalable algorithm to compute the overlap
2642 
2643   Collective
2644 
2645   Input Parameters:
2646 + A  - the matrix
2647 - sc - `PETSC_TRUE` indicates use the scalable algorithm (default is not to use the scalable algorithm)
2648 
2649   Level: advanced
2650 
2651 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2652 @*/
2653 PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap(Mat A, PetscBool sc)
2654 {
2655   PetscFunctionBegin;
2656   PetscTryMethod(A, "MatMPIAIJSetUseScalableIncreaseOverlap_C", (Mat, PetscBool), (A, sc));
2657   PetscFunctionReturn(PETSC_SUCCESS);
2658 }
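/*
  A minimal usage sketch (A is assumed to be an existing MATMPIAIJ): select the scalable overlap
  algorithm before something such as an additive Schwarz preconditioner triggers
  MatIncreaseOverlap().

    PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, PETSC_TRUE));

  The same choice is available from the options database via -mat_increase_overlap_scalable,
  handled in MatSetFromOptions_MPIAIJ() below.
*/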
2659 
2660 PetscErrorCode MatSetFromOptions_MPIAIJ(Mat A, PetscOptionItems PetscOptionsObject)
2661 {
2662   PetscBool sc = PETSC_FALSE, flg;
2663 
2664   PetscFunctionBegin;
2665   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJ options");
2666   if (A->ops->increaseoverlap == MatIncreaseOverlap_MPIAIJ_Scalable) sc = PETSC_TRUE;
2667   PetscCall(PetscOptionsBool("-mat_increase_overlap_scalable", "Use a scalable algorithm to compute the overlap", "MatIncreaseOverlap", sc, &sc, &flg));
2668   if (flg) PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, sc));
2669   PetscOptionsHeadEnd();
2670   PetscFunctionReturn(PETSC_SUCCESS);
2671 }
2672 
2673 static PetscErrorCode MatShift_MPIAIJ(Mat Y, PetscScalar a)
2674 {
2675   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)Y->data;
2676   Mat_SeqAIJ *aij  = (Mat_SeqAIJ *)maij->A->data;
2677 
2678   PetscFunctionBegin;
2679   if (!Y->preallocated) {
2680     PetscCall(MatMPIAIJSetPreallocation(Y, 1, NULL, 0, NULL));
2681   } else if (!aij->nz) { /* It does not matter if diagonals of Y only partially lie in maij->A. We just need an estimated preallocation. */
2682     PetscInt nonew = aij->nonew;
2683     PetscCall(MatSeqAIJSetPreallocation(maij->A, 1, NULL));
2684     aij->nonew = nonew;
2685   }
2686   PetscCall(MatShift_Basic(Y, a));
2687   PetscFunctionReturn(PETSC_SUCCESS);
2688 }
2689 
2690 static PetscErrorCode MatMissingDiagonal_MPIAIJ(Mat A, PetscBool *missing, PetscInt *d)
2691 {
2692   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2693 
2694   PetscFunctionBegin;
2695   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only works for square matrices");
2696   PetscCall(MatMissingDiagonal(a->A, missing, d));
2697   if (d) {
2698     PetscInt rstart;
2699     PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
2700     *d += rstart;
2701   }
2702   PetscFunctionReturn(PETSC_SUCCESS);
2703 }
2704 
2705 static PetscErrorCode MatInvertVariableBlockDiagonal_MPIAIJ(Mat A, PetscInt nblocks, const PetscInt *bsizes, PetscScalar *diag)
2706 {
2707   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2708 
2709   PetscFunctionBegin;
2710   PetscCall(MatInvertVariableBlockDiagonal(a->A, nblocks, bsizes, diag));
2711   PetscFunctionReturn(PETSC_SUCCESS);
2712 }
2713 
2714 static PetscErrorCode MatEliminateZeros_MPIAIJ(Mat A, PetscBool keep)
2715 {
2716   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2717 
2718   PetscFunctionBegin;
2719   PetscCall(MatEliminateZeros_SeqAIJ(a->A, keep));        // possibly keep zero diagonal coefficients
2720   PetscCall(MatEliminateZeros_SeqAIJ(a->B, PETSC_FALSE)); // never keep zero diagonal coefficients
2721   PetscFunctionReturn(PETSC_SUCCESS);
2722 }
2723 
2724 static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
2725                                        MatGetRow_MPIAIJ,
2726                                        MatRestoreRow_MPIAIJ,
2727                                        MatMult_MPIAIJ,
2728                                        /* 4*/ MatMultAdd_MPIAIJ,
2729                                        MatMultTranspose_MPIAIJ,
2730                                        MatMultTransposeAdd_MPIAIJ,
2731                                        NULL,
2732                                        NULL,
2733                                        NULL,
2734                                        /*10*/ NULL,
2735                                        NULL,
2736                                        NULL,
2737                                        MatSOR_MPIAIJ,
2738                                        MatTranspose_MPIAIJ,
2739                                        /*15*/ MatGetInfo_MPIAIJ,
2740                                        MatEqual_MPIAIJ,
2741                                        MatGetDiagonal_MPIAIJ,
2742                                        MatDiagonalScale_MPIAIJ,
2743                                        MatNorm_MPIAIJ,
2744                                        /*20*/ MatAssemblyBegin_MPIAIJ,
2745                                        MatAssemblyEnd_MPIAIJ,
2746                                        MatSetOption_MPIAIJ,
2747                                        MatZeroEntries_MPIAIJ,
2748                                        /*24*/ MatZeroRows_MPIAIJ,
2749                                        NULL,
2750                                        NULL,
2751                                        NULL,
2752                                        NULL,
2753                                        /*29*/ MatSetUp_MPI_Hash,
2754                                        NULL,
2755                                        NULL,
2756                                        MatGetDiagonalBlock_MPIAIJ,
2757                                        NULL,
2758                                        /*34*/ MatDuplicate_MPIAIJ,
2759                                        NULL,
2760                                        NULL,
2761                                        NULL,
2762                                        NULL,
2763                                        /*39*/ MatAXPY_MPIAIJ,
2764                                        MatCreateSubMatrices_MPIAIJ,
2765                                        MatIncreaseOverlap_MPIAIJ,
2766                                        MatGetValues_MPIAIJ,
2767                                        MatCopy_MPIAIJ,
2768                                        /*44*/ MatGetRowMax_MPIAIJ,
2769                                        MatScale_MPIAIJ,
2770                                        MatShift_MPIAIJ,
2771                                        MatDiagonalSet_MPIAIJ,
2772                                        MatZeroRowsColumns_MPIAIJ,
2773                                        /*49*/ MatSetRandom_MPIAIJ,
2774                                        MatGetRowIJ_MPIAIJ,
2775                                        MatRestoreRowIJ_MPIAIJ,
2776                                        NULL,
2777                                        NULL,
2778                                        /*54*/ MatFDColoringCreate_MPIXAIJ,
2779                                        NULL,
2780                                        MatSetUnfactored_MPIAIJ,
2781                                        MatPermute_MPIAIJ,
2782                                        NULL,
2783                                        /*59*/ MatCreateSubMatrix_MPIAIJ,
2784                                        MatDestroy_MPIAIJ,
2785                                        MatView_MPIAIJ,
2786                                        NULL,
2787                                        NULL,
2788                                        /*64*/ MatMatMatMultNumeric_MPIAIJ_MPIAIJ_MPIAIJ,
2789                                        NULL,
2790                                        NULL,
2791                                        NULL,
2792                                        MatGetRowMaxAbs_MPIAIJ,
2793                                        /*69*/ MatGetRowMinAbs_MPIAIJ,
2794                                        NULL,
2795                                        NULL,
2796                                        MatFDColoringApply_AIJ,
2797                                        MatSetFromOptions_MPIAIJ,
2798                                        MatFindZeroDiagonals_MPIAIJ,
2799                                        /*75*/ NULL,
2800                                        NULL,
2801                                        NULL,
2802                                        MatLoad_MPIAIJ,
2803                                        NULL,
2804                                        /*80*/ NULL,
2805                                        NULL,
2806                                        NULL,
2807                                        /*83*/ NULL,
2808                                        NULL,
2809                                        MatMatMultNumeric_MPIAIJ_MPIAIJ,
2810                                        MatPtAPNumeric_MPIAIJ_MPIAIJ,
2811                                        NULL,
2812                                        NULL,
2813                                        /*89*/ MatBindToCPU_MPIAIJ,
2814                                        MatProductSetFromOptions_MPIAIJ,
2815                                        NULL,
2816                                        NULL,
2817                                        MatConjugate_MPIAIJ,
2818                                        /*94*/ NULL,
2819                                        MatSetValuesRow_MPIAIJ,
2820                                        MatRealPart_MPIAIJ,
2821                                        MatImaginaryPart_MPIAIJ,
2822                                        NULL,
2823                                        /*99*/ NULL,
2824                                        NULL,
2825                                        NULL,
2826                                        MatGetRowMin_MPIAIJ,
2827                                        NULL,
2828                                        /*104*/ MatMissingDiagonal_MPIAIJ,
2829                                        MatGetSeqNonzeroStructure_MPIAIJ,
2830                                        NULL,
2831                                        MatGetGhosts_MPIAIJ,
2832                                        NULL,
2833                                        /*109*/ NULL,
2834                                        MatMultDiagonalBlock_MPIAIJ,
2835                                        NULL,
2836                                        NULL,
2837                                        NULL,
2838                                        /*114*/ MatGetMultiProcBlock_MPIAIJ,
2839                                        MatFindNonzeroRows_MPIAIJ,
2840                                        MatGetColumnReductions_MPIAIJ,
2841                                        MatInvertBlockDiagonal_MPIAIJ,
2842                                        MatInvertVariableBlockDiagonal_MPIAIJ,
2843                                        /*119*/ MatCreateSubMatricesMPI_MPIAIJ,
2844                                        NULL,
2845                                        NULL,
2846                                        MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ,
2847                                        NULL,
2848                                        /*124*/ NULL,
2849                                        NULL,
2850                                        NULL,
2851                                        MatSetBlockSizes_MPIAIJ,
2852                                        NULL,
2853                                        /*129*/ MatFDColoringSetUp_MPIXAIJ,
2854                                        MatFindOffBlockDiagonalEntries_MPIAIJ,
2855                                        MatCreateMPIMatConcatenateSeqMat_MPIAIJ,
2856                                        NULL,
2857                                        NULL,
2858                                        /*134*/ NULL,
2859                                        MatCreateGraph_Simple_AIJ,
2860                                        NULL,
2861                                        MatEliminateZeros_MPIAIJ,
2862                                        MatGetRowSumAbs_MPIAIJ,
2863                                        /*139*/ NULL,
2864                                        NULL,
2865                                        NULL,
2866                                        MatCopyHashToXAIJ_MPI_Hash,
2867                                        MatGetCurrentMemType_MPIAIJ};
2868 
2869 static PetscErrorCode MatStoreValues_MPIAIJ(Mat mat)
2870 {
2871   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2872 
2873   PetscFunctionBegin;
2874   PetscCall(MatStoreValues(aij->A));
2875   PetscCall(MatStoreValues(aij->B));
2876   PetscFunctionReturn(PETSC_SUCCESS);
2877 }
2878 
2879 static PetscErrorCode MatRetrieveValues_MPIAIJ(Mat mat)
2880 {
2881   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2882 
2883   PetscFunctionBegin;
2884   PetscCall(MatRetrieveValues(aij->A));
2885   PetscCall(MatRetrieveValues(aij->B));
2886   PetscFunctionReturn(PETSC_SUCCESS);
2887 }
2888 
2889 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJ(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
2890 {
2891   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2892   PetscMPIInt size;
2893 
2894   PetscFunctionBegin;
2895   if (B->hash_active) {
2896     B->ops[0]      = b->cops;
2897     B->hash_active = PETSC_FALSE;
2898   }
2899   PetscCall(PetscLayoutSetUp(B->rmap));
2900   PetscCall(PetscLayoutSetUp(B->cmap));
2901 
2902 #if defined(PETSC_USE_CTABLE)
2903   PetscCall(PetscHMapIDestroy(&b->colmap));
2904 #else
2905   PetscCall(PetscFree(b->colmap));
2906 #endif
2907   PetscCall(PetscFree(b->garray));
2908   PetscCall(VecDestroy(&b->lvec));
2909   PetscCall(VecScatterDestroy(&b->Mvctx));
2910 
2911   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2912 
2913   MatSeqXAIJGetOptions_Private(b->B);
2914   PetscCall(MatDestroy(&b->B));
2915   PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
2916   PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
2917   PetscCall(MatSetBlockSizesFromMats(b->B, B, B));
2918   PetscCall(MatSetType(b->B, MATSEQAIJ));
2919   MatSeqXAIJRestoreOptions_Private(b->B);
2920 
2921   MatSeqXAIJGetOptions_Private(b->A);
2922   PetscCall(MatDestroy(&b->A));
2923   PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
2924   PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
2925   PetscCall(MatSetBlockSizesFromMats(b->A, B, B));
2926   PetscCall(MatSetType(b->A, MATSEQAIJ));
2927   MatSeqXAIJRestoreOptions_Private(b->A);
2928 
2929   PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
2930   PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
2931   B->preallocated  = PETSC_TRUE;
2932   B->was_assembled = PETSC_FALSE;
2933   B->assembled     = PETSC_FALSE;
2934   PetscFunctionReturn(PETSC_SUCCESS);
2935 }
2936 
2937 static PetscErrorCode MatResetPreallocation_MPIAIJ(Mat B)
2938 {
2939   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2940   PetscBool   ondiagreset, offdiagreset, memoryreset;
2941 
2942   PetscFunctionBegin;
2943   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
2944   PetscCheck(B->insertmode == NOT_SET_VALUES, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot reset preallocation after setting some values but not yet calling MatAssemblyBegin()/MatAssemblyEnd()");
2945   if (B->num_ass == 0) PetscFunctionReturn(PETSC_SUCCESS);
2946 
2947   PetscCall(MatResetPreallocation_SeqAIJ_Private(b->A, &ondiagreset));
2948   PetscCall(MatResetPreallocation_SeqAIJ_Private(b->B, &offdiagreset));
2949   memoryreset = (PetscBool)(ondiagreset || offdiagreset);
2950   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &memoryreset, 1, MPIU_BOOL, MPI_LOR, PetscObjectComm((PetscObject)B)));
2951   if (!memoryreset) PetscFunctionReturn(PETSC_SUCCESS);
2952 
2953   PetscCall(PetscLayoutSetUp(B->rmap));
2954   PetscCall(PetscLayoutSetUp(B->cmap));
2955   PetscCheck(B->assembled || B->was_assembled, PetscObjectComm((PetscObject)B), PETSC_ERR_ARG_WRONGSTATE, "Should not need to reset preallocation if the matrix was never assembled");
2956   PetscCall(MatDisAssemble_MPIAIJ(B, PETSC_TRUE));
2957   PetscCall(VecScatterDestroy(&b->Mvctx));
2958 
2959   B->preallocated  = PETSC_TRUE;
2960   B->was_assembled = PETSC_FALSE;
2961   B->assembled     = PETSC_FALSE;
2962   /* Log that the state of this object has changed; this will help guarantee that preconditioners get re-setup */
2963   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2964   PetscFunctionReturn(PETSC_SUCCESS);
2965 }
2966 
2967 PetscErrorCode MatDuplicate_MPIAIJ(Mat matin, MatDuplicateOption cpvalues, Mat *newmat)
2968 {
2969   Mat         mat;
2970   Mat_MPIAIJ *a, *oldmat = (Mat_MPIAIJ *)matin->data;
2971 
2972   PetscFunctionBegin;
2973   *newmat = NULL;
2974   PetscCall(MatCreate(PetscObjectComm((PetscObject)matin), &mat));
2975   PetscCall(MatSetSizes(mat, matin->rmap->n, matin->cmap->n, matin->rmap->N, matin->cmap->N));
2976   PetscCall(MatSetBlockSizesFromMats(mat, matin, matin));
2977   PetscCall(MatSetType(mat, ((PetscObject)matin)->type_name));
2978   a = (Mat_MPIAIJ *)mat->data;
2979 
2980   mat->factortype = matin->factortype;
2981   mat->assembled  = matin->assembled;
2982   mat->insertmode = NOT_SET_VALUES;
2983 
2984   a->size         = oldmat->size;
2985   a->rank         = oldmat->rank;
2986   a->donotstash   = oldmat->donotstash;
2987   a->roworiented  = oldmat->roworiented;
2988   a->rowindices   = NULL;
2989   a->rowvalues    = NULL;
2990   a->getrowactive = PETSC_FALSE;
2991 
2992   PetscCall(PetscLayoutReference(matin->rmap, &mat->rmap));
2993   PetscCall(PetscLayoutReference(matin->cmap, &mat->cmap));
2994   if (matin->hash_active) {
2995     PetscCall(MatSetUp(mat));
2996   } else {
2997     mat->preallocated = matin->preallocated;
2998     if (oldmat->colmap) {
2999 #if defined(PETSC_USE_CTABLE)
3000       PetscCall(PetscHMapIDuplicate(oldmat->colmap, &a->colmap));
3001 #else
3002       PetscCall(PetscMalloc1(mat->cmap->N, &a->colmap));
3003       PetscCall(PetscArraycpy(a->colmap, oldmat->colmap, mat->cmap->N));
3004 #endif
3005     } else a->colmap = NULL;
3006     if (oldmat->garray) {
3007       PetscInt len;
3008       len = oldmat->B->cmap->n;
3009       PetscCall(PetscMalloc1(len + 1, &a->garray));
3010       if (len) PetscCall(PetscArraycpy(a->garray, oldmat->garray, len));
3011     } else a->garray = NULL;
3012 
3013     /* MatDuplicate() may be called with a non-assembled matrix;
3014       in fact, MatDuplicate() only requires the matrix to be preallocated.
3015       This may happen inside DMCreateMatrix_Shell() */
3016     if (oldmat->lvec) PetscCall(VecDuplicate(oldmat->lvec, &a->lvec));
3017     if (oldmat->Mvctx) {
3018       a->Mvctx = oldmat->Mvctx;
3019       PetscCall(PetscObjectReference((PetscObject)oldmat->Mvctx));
3020     }
3021     PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));
3022     PetscCall(MatDuplicate(oldmat->B, cpvalues, &a->B));
3023   }
3024   PetscCall(PetscFunctionListDuplicate(((PetscObject)matin)->qlist, &((PetscObject)mat)->qlist));
3025   *newmat = mat;
3026   PetscFunctionReturn(PETSC_SUCCESS);
3027 }
3028 
3029 PetscErrorCode MatLoad_MPIAIJ(Mat newMat, PetscViewer viewer)
3030 {
3031   PetscBool isbinary, ishdf5;
3032 
3033   PetscFunctionBegin;
3034   PetscValidHeaderSpecific(newMat, MAT_CLASSID, 1);
3035   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
3036   /* force binary viewer to load .info file if it has not yet done so */
3037   PetscCall(PetscViewerSetUp(viewer));
3038   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3039   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERHDF5, &ishdf5));
3040   if (isbinary) {
3041     PetscCall(MatLoad_MPIAIJ_Binary(newMat, viewer));
3042   } else if (ishdf5) {
3043 #if defined(PETSC_HAVE_HDF5)
3044     PetscCall(MatLoad_AIJ_HDF5(newMat, viewer));
3045 #else
3046     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "HDF5 not supported in this build.\nPlease reconfigure using --download-hdf5");
3047 #endif
3048   } else {
3049     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)newMat)->type_name);
3050   }
3051   PetscFunctionReturn(PETSC_SUCCESS);
3052 }
3053 
3054 PetscErrorCode MatLoad_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
3055 {
3056   PetscInt     header[4], M, N, m, nz, rows, cols, sum, i;
3057   PetscInt    *rowidxs, *colidxs;
3058   PetscScalar *matvals;
3059 
3060   PetscFunctionBegin;
3061   PetscCall(PetscViewerSetUp(viewer));
3062 
3063   /* read in matrix header */
3064   PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
3065   PetscCheck(header[0] == MAT_FILE_CLASSID, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
3066   M  = header[1];
3067   N  = header[2];
3068   nz = header[3];
3069   PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
3070   PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
3071   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as MPIAIJ");
3072 
3073   /* set block sizes from the viewer's .info file */
3074   PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
3075   /* set global sizes if not set already */
3076   if (mat->rmap->N < 0) mat->rmap->N = M;
3077   if (mat->cmap->N < 0) mat->cmap->N = N;
3078   PetscCall(PetscLayoutSetUp(mat->rmap));
3079   PetscCall(PetscLayoutSetUp(mat->cmap));
3080 
3081   /* check if the matrix sizes are correct */
3082   PetscCall(MatGetSize(mat, &rows, &cols));
3083   PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);
3084 
3085   /* read in row lengths and build row indices */
3086   PetscCall(MatGetLocalSize(mat, &m, NULL));
3087   PetscCall(PetscMalloc1(m + 1, &rowidxs));
3088   PetscCall(PetscViewerBinaryReadAll(viewer, rowidxs + 1, m, PETSC_DECIDE, M, PETSC_INT));
3089   rowidxs[0] = 0;
3090   for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3091   if (nz != PETSC_INT_MAX) {
3092     PetscCallMPI(MPIU_Allreduce(&rowidxs[m], &sum, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)viewer)));
3093     PetscCheck(sum == nz, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);
3094   }
3095 
3096   /* read in column indices and matrix values */
3097   PetscCall(PetscMalloc2(rowidxs[m], &colidxs, rowidxs[m], &matvals));
3098   PetscCall(PetscViewerBinaryReadAll(viewer, colidxs, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
3099   PetscCall(PetscViewerBinaryReadAll(viewer, matvals, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
3100   /* store matrix indices and values */
3101   PetscCall(MatMPIAIJSetPreallocationCSR(mat, rowidxs, colidxs, matvals));
3102   PetscCall(PetscFree(rowidxs));
3103   PetscCall(PetscFree2(colidxs, matvals));
3104   PetscFunctionReturn(PETSC_SUCCESS);
3105 }
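/*
  A minimal loading sketch (the file name below is hypothetical). The binary format consumed
  above is a 4-entry PetscInt header (MAT_FILE_CLASSID, M, N, nz), followed by the M row
  lengths, the nz column indices, and the nz values.

    Mat         A;
    PetscViewer viewer;
    PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, "matrix.dat", FILE_MODE_READ, &viewer));
    PetscCall(MatCreate(PETSC_COMM_WORLD, &A));
    PetscCall(MatSetType(A, MATMPIAIJ));
    PetscCall(MatLoad(A, viewer));
    PetscCall(PetscViewerDestroy(&viewer));
*/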
3106 
3107 /* Not scalable because of ISAllGather() unless getting all columns. */
3108 static PetscErrorCode ISGetSeqIS_Private(Mat mat, IS iscol, IS *isseq)
3109 {
3110   IS          iscol_local;
3111   PetscBool   isstride;
3112   PetscMPIInt gisstride = 0;
3113 
3114   PetscFunctionBegin;
3115   /* check if we are grabbing all columns */
3116   PetscCall(PetscObjectTypeCompare((PetscObject)iscol, ISSTRIDE, &isstride));
3117 
3118   if (isstride) {
3119     PetscInt start, len, mstart, mlen;
3120     PetscCall(ISStrideGetInfo(iscol, &start, NULL));
3121     PetscCall(ISGetLocalSize(iscol, &len));
3122     PetscCall(MatGetOwnershipRangeColumn(mat, &mstart, &mlen));
3123     if (mstart == start && mlen - mstart == len) gisstride = 1;
3124   }
3125 
3126   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &gisstride, 1, MPI_INT, MPI_MIN, PetscObjectComm((PetscObject)mat)));
3127   if (gisstride) {
3128     PetscInt N;
3129     PetscCall(MatGetSize(mat, NULL, &N));
3130     PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iscol_local));
3131     PetscCall(ISSetIdentity(iscol_local));
3132     PetscCall(PetscInfo(mat, "Optimizing for obtaining all columns of the matrix; skipping ISAllGather()\n"));
3133   } else {
3134     PetscInt cbs;
3135     PetscCall(ISGetBlockSize(iscol, &cbs));
3136     PetscCall(ISAllGather(iscol, &iscol_local));
3137     PetscCall(ISSetBlockSize(iscol_local, cbs));
3138   }
3139 
3140   *isseq = iscol_local;
3141   PetscFunctionReturn(PETSC_SUCCESS);
3142 }
3143 
3144 /*
3145  Used by MatCreateSubMatrix_MPIAIJ_SameRowColDist() to avoid ISAllGather() and global size of iscol_local
3146  (see MatCreateSubMatrix_MPIAIJ_nonscalable)
3147 
3148  Input Parameters:
3149 +   mat - matrix
3150 +   isrow - parallel row index set; its local indices are a subset of local rows of `mat`,
3151            i.e., mat->rstart <= isrow[i] < mat->rend
3152 -   iscol - parallel column index set; its local indices are a subset of local columns of `mat`,
3153            i.e., mat->cstart <= iscol[i] < mat->cend
3154 
3155  Output Parameters:
3156 +   isrow_d - sequential row index set for retrieving mat->A
3157 .   iscol_d - sequential column index set for retrieving mat->A
3158 .   iscol_o - sequential column index set for retrieving mat->B
3159 -   garray - column map; garray[i] indicates global location of iscol_o[i] in `iscol`
3160  */
3161 static PetscErrorCode ISGetSeqIS_SameColDist_Private(Mat mat, IS isrow, IS iscol, IS *isrow_d, IS *iscol_d, IS *iscol_o, PetscInt *garray[])
3162 {
3163   Vec             x, cmap;
3164   const PetscInt *is_idx;
3165   PetscScalar    *xarray, *cmaparray;
3166   PetscInt        ncols, isstart, *idx, m, rstart, *cmap1, count;
3167   Mat_MPIAIJ     *a    = (Mat_MPIAIJ *)mat->data;
3168   Mat             B    = a->B;
3169   Vec             lvec = a->lvec, lcmap;
3170   PetscInt        i, cstart, cend, Bn = B->cmap->N;
3171   MPI_Comm        comm;
3172   VecScatter      Mvctx = a->Mvctx;
3173 
3174   PetscFunctionBegin;
3175   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3176   PetscCall(ISGetLocalSize(iscol, &ncols));
3177 
3178   /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */
3179   PetscCall(MatCreateVecs(mat, &x, NULL));
3180   PetscCall(VecSet(x, -1.0));
3181   PetscCall(VecDuplicate(x, &cmap));
3182   PetscCall(VecSet(cmap, -1.0));
3183 
3184   /* Get start indices */
3185   PetscCallMPI(MPI_Scan(&ncols, &isstart, 1, MPIU_INT, MPI_SUM, comm));
3186   isstart -= ncols;
3187   PetscCall(MatGetOwnershipRangeColumn(mat, &cstart, &cend));
3188 
3189   PetscCall(ISGetIndices(iscol, &is_idx));
3190   PetscCall(VecGetArray(x, &xarray));
3191   PetscCall(VecGetArray(cmap, &cmaparray));
3192   PetscCall(PetscMalloc1(ncols, &idx));
3193   for (i = 0; i < ncols; i++) {
3194     xarray[is_idx[i] - cstart]    = (PetscScalar)is_idx[i];
3195     cmaparray[is_idx[i] - cstart] = i + isstart;        /* global index of iscol[i] */
3196     idx[i]                        = is_idx[i] - cstart; /* local index of iscol[i]  */
3197   }
3198   PetscCall(VecRestoreArray(x, &xarray));
3199   PetscCall(VecRestoreArray(cmap, &cmaparray));
3200   PetscCall(ISRestoreIndices(iscol, &is_idx));
3201 
3202   /* Get iscol_d */
3203   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, iscol_d));
3204   PetscCall(ISGetBlockSize(iscol, &i));
3205   PetscCall(ISSetBlockSize(*iscol_d, i));
3206 
3207   /* Get isrow_d */
3208   PetscCall(ISGetLocalSize(isrow, &m));
3209   rstart = mat->rmap->rstart;
3210   PetscCall(PetscMalloc1(m, &idx));
3211   PetscCall(ISGetIndices(isrow, &is_idx));
3212   for (i = 0; i < m; i++) idx[i] = is_idx[i] - rstart;
3213   PetscCall(ISRestoreIndices(isrow, &is_idx));
3214 
3215   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, m, idx, PETSC_OWN_POINTER, isrow_d));
3216   PetscCall(ISGetBlockSize(isrow, &i));
3217   PetscCall(ISSetBlockSize(*isrow_d, i));
3218 
3219   /* (2) Scatter x and cmap using aij->Mvctx to get their off-process portions (see MatMult_MPIAIJ) */
3220   PetscCall(VecScatterBegin(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3221   PetscCall(VecScatterEnd(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3222 
3223   PetscCall(VecDuplicate(lvec, &lcmap));
3224 
3225   PetscCall(VecScatterBegin(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3226   PetscCall(VecScatterEnd(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3227 
3228   /* (3) create sequential iscol_o (a subset of iscol) and isgarray */
3229   /* off-process column indices */
3230   count = 0;
3231   PetscCall(PetscMalloc1(Bn, &idx));
3232   PetscCall(PetscMalloc1(Bn, &cmap1));
3233 
3234   PetscCall(VecGetArray(lvec, &xarray));
3235   PetscCall(VecGetArray(lcmap, &cmaparray));
3236   for (i = 0; i < Bn; i++) {
3237     if (PetscRealPart(xarray[i]) > -1.0) {
3238       idx[count]   = i;                                     /* local column index in off-diagonal part B */
3239       cmap1[count] = (PetscInt)PetscRealPart(cmaparray[i]); /* column index in submat */
3240       count++;
3241     }
3242   }
3243   PetscCall(VecRestoreArray(lvec, &xarray));
3244   PetscCall(VecRestoreArray(lcmap, &cmaparray));
3245 
3246   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_COPY_VALUES, iscol_o));
3247   /* cannot ensure iscol_o has same blocksize as iscol! */
3248 
3249   PetscCall(PetscFree(idx));
3250   *garray = cmap1;
3251 
3252   PetscCall(VecDestroy(&x));
3253   PetscCall(VecDestroy(&cmap));
3254   PetscCall(VecDestroy(&lcmap));
3255   PetscFunctionReturn(PETSC_SUCCESS);
3256 }
3257 
3258 /* isrow and iscol have same processor distribution as mat, output *submat is a submatrix of local mat */
3259 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowColDist(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *submat)
3260 {
3261   Mat_MPIAIJ *a = (Mat_MPIAIJ *)mat->data, *asub;
3262   Mat         M = NULL;
3263   MPI_Comm    comm;
3264   IS          iscol_d, isrow_d, iscol_o;
3265   Mat         Asub = NULL, Bsub = NULL;
3266   PetscInt    n, count, M_size, N_size;
3267 
3268   PetscFunctionBegin;
3269   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3270 
3271   if (call == MAT_REUSE_MATRIX) {
3272     /* Retrieve isrow_d, iscol_d and iscol_o from submat */
3273     PetscCall(PetscObjectQuery((PetscObject)*submat, "isrow_d", (PetscObject *)&isrow_d));
3274     PetscCheck(isrow_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "isrow_d passed in was not used before, cannot reuse");
3275 
3276     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_d", (PetscObject *)&iscol_d));
3277     PetscCheck(iscol_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_d passed in was not used before, cannot reuse");
3278 
3279     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_o", (PetscObject *)&iscol_o));
3280     PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse");
3281 
3282     /* Update diagonal and off-diagonal portions of submat */
3283     asub = (Mat_MPIAIJ *)(*submat)->data;
3284     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->A));
3285     PetscCall(ISGetLocalSize(iscol_o, &n));
3286     if (n) PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->B));
3287     PetscCall(MatAssemblyBegin(*submat, MAT_FINAL_ASSEMBLY));
3288     PetscCall(MatAssemblyEnd(*submat, MAT_FINAL_ASSEMBLY));
3289 
3290   } else { /* call == MAT_INITIAL_MATRIX) */
3291     PetscInt *garray, *garray_compact;
3292     PetscInt  BsubN;
3293 
3294     /* Create isrow_d, iscol_d, iscol_o and isgarray (replace isgarray with array?) */
3295     PetscCall(ISGetSeqIS_SameColDist_Private(mat, isrow, iscol, &isrow_d, &iscol_d, &iscol_o, &garray));
3296 
3297     /* Create local submatrices Asub and Bsub */
3298     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Asub));
3299     PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Bsub));
3300 
3301     // Compact garray so it is not of size Bn
3302     PetscCall(ISGetSize(iscol_o, &count));
3303     PetscCall(PetscMalloc1(count, &garray_compact));
3304     PetscCall(PetscArraycpy(garray_compact, garray, count));
3305 
3306     /* Create submatrix M */
3307     PetscCall(ISGetSize(isrow, &M_size));
3308     PetscCall(ISGetSize(iscol, &N_size));
3309     PetscCall(MatCreateMPIAIJWithSeqAIJ(comm, M_size, N_size, Asub, Bsub, garray_compact, &M));
3310 
3311     /* If Bsub has empty columns, compress iscol_o such that it will retrieve condensed Bsub from a->B during reuse */
3312     asub = (Mat_MPIAIJ *)M->data;
3313 
3314     PetscCall(ISGetLocalSize(iscol_o, &BsubN));
3315     n = asub->B->cmap->N;
3316     if (BsubN > n) {
3317       /* This case can be tested using ~petsc/src/tao/bound/tutorials/runplate2_3 */
3318       const PetscInt *idx;
3319       PetscInt        i, j, *idx_new, *subgarray = asub->garray;
3320       PetscCall(PetscInfo(M, "submatrix Bn %" PetscInt_FMT " != BsubN %" PetscInt_FMT ", update iscol_o\n", n, BsubN));
3321 
3322       PetscCall(PetscMalloc1(n, &idx_new));
3323       j = 0;
3324       PetscCall(ISGetIndices(iscol_o, &idx));
3325       for (i = 0; i < n; i++) {
3326         if (j >= BsubN) break;
3327         while (subgarray[i] > garray[j]) j++;
3328 
3329         if (subgarray[i] == garray[j]) {
3330           idx_new[i] = idx[j++];
3331         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "subgarray[%" PetscInt_FMT "]=%" PetscInt_FMT " cannot be smaller than garray[%" PetscInt_FMT "]=%" PetscInt_FMT, i, subgarray[i], j, garray[j]);
3332       }
3333       PetscCall(ISRestoreIndices(iscol_o, &idx));
3334 
3335       PetscCall(ISDestroy(&iscol_o));
3336       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, idx_new, PETSC_OWN_POINTER, &iscol_o));
3337 
3338     } else if (BsubN < n) {
3339       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Columns of Bsub (%" PetscInt_FMT ") cannot be smaller than B's (%" PetscInt_FMT ")", BsubN, asub->B->cmap->N);
3340     }
3341 
3342     PetscCall(PetscFree(garray));
3343     *submat = M;
3344 
3345     /* Save isrow_d, iscol_d and iscol_o used in processor for next request */
3346     PetscCall(PetscObjectCompose((PetscObject)M, "isrow_d", (PetscObject)isrow_d));
3347     PetscCall(ISDestroy(&isrow_d));
3348 
3349     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_d", (PetscObject)iscol_d));
3350     PetscCall(ISDestroy(&iscol_d));
3351 
3352     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_o", (PetscObject)iscol_o));
3353     PetscCall(ISDestroy(&iscol_o));
3354   }
3355   PetscFunctionReturn(PETSC_SUCCESS);
3356 }
3357 
3358 PetscErrorCode MatCreateSubMatrix_MPIAIJ(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *newmat)
3359 {
3360   IS        iscol_local = NULL, isrow_d;
3361   PetscInt  csize;
3362   PetscInt  n, i, j, start, end;
3363   PetscBool sameRowDist = PETSC_FALSE, sameDist[2], tsameDist[2];
3364   MPI_Comm  comm;
3365 
3366   PetscFunctionBegin;
3367   /* If isrow has same processor distribution as mat,
3368      call MatCreateSubMatrix_MPIAIJ_SameRowDist() to avoid using a hash table with global size of iscol */
3369   if (call == MAT_REUSE_MATRIX) {
3370     PetscCall(PetscObjectQuery((PetscObject)*newmat, "isrow_d", (PetscObject *)&isrow_d));
3371     if (isrow_d) {
3372       sameRowDist  = PETSC_TRUE;
3373       tsameDist[1] = PETSC_TRUE; /* sameColDist */
3374     } else {
3375       PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_local));
3376       if (iscol_local) {
3377         sameRowDist  = PETSC_TRUE;
3378         tsameDist[1] = PETSC_FALSE; /* !sameColDist */
3379       }
3380     }
3381   } else {
3382     /* Check if isrow has same processor distribution as mat */
3383     sameDist[0] = PETSC_FALSE;
3384     PetscCall(ISGetLocalSize(isrow, &n));
3385     if (!n) {
3386       sameDist[0] = PETSC_TRUE;
3387     } else {
3388       PetscCall(ISGetMinMax(isrow, &i, &j));
3389       PetscCall(MatGetOwnershipRange(mat, &start, &end));
3390       if (i >= start && j < end) sameDist[0] = PETSC_TRUE;
3391     }
3392 
3393     /* Check if iscol has same processor distribution as mat */
3394     sameDist[1] = PETSC_FALSE;
3395     PetscCall(ISGetLocalSize(iscol, &n));
3396     if (!n) {
3397       sameDist[1] = PETSC_TRUE;
3398     } else {
3399       PetscCall(ISGetMinMax(iscol, &i, &j));
3400       PetscCall(MatGetOwnershipRangeColumn(mat, &start, &end));
3401       if (i >= start && j < end) sameDist[1] = PETSC_TRUE;
3402     }
3403 
3404     PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3405     PetscCallMPI(MPIU_Allreduce(&sameDist, &tsameDist, 2, MPIU_BOOL, MPI_LAND, comm));
3406     sameRowDist = tsameDist[0];
3407   }
3408 
3409   if (sameRowDist) {
3410     if (tsameDist[1]) { /* sameRowDist & sameColDist */
3411       /* isrow and iscol have same processor distribution as mat */
3412       PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowColDist(mat, isrow, iscol, call, newmat));
3413       PetscFunctionReturn(PETSC_SUCCESS);
3414     } else { /* sameRowDist */
3415       /* isrow has same processor distribution as mat */
3416       if (call == MAT_INITIAL_MATRIX) {
3417         PetscBool sorted;
3418         PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3419         PetscCall(ISGetLocalSize(iscol_local, &n)); /* local size of iscol_local = global columns of newmat */
3420         PetscCall(ISGetSize(iscol, &i));
3421         PetscCheck(n == i, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "n %" PetscInt_FMT " != size of iscol %" PetscInt_FMT, n, i);
3422 
3423         PetscCall(ISSorted(iscol_local, &sorted));
3424         if (sorted) {
3425           /* MatCreateSubMatrix_MPIAIJ_SameRowDist() requires iscol_local be sorted; it can have duplicate indices */
3426           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, iscol_local, MAT_INITIAL_MATRIX, newmat));
3427           PetscFunctionReturn(PETSC_SUCCESS);
3428         }
3429       } else { /* call == MAT_REUSE_MATRIX */
3430         IS iscol_sub;
3431         PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3432         if (iscol_sub) {
3433           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, NULL, call, newmat));
3434           PetscFunctionReturn(PETSC_SUCCESS);
3435         }
3436       }
3437     }
3438   }
3439 
3440   /* General case: iscol -> iscol_local which has global size of iscol */
3441   if (call == MAT_REUSE_MATRIX) {
3442     PetscCall(PetscObjectQuery((PetscObject)*newmat, "ISAllGather", (PetscObject *)&iscol_local));
3443     PetscCheck(iscol_local, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3444   } else {
3445     if (!iscol_local) PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3446   }
3447 
3448   PetscCall(ISGetLocalSize(iscol, &csize));
3449   PetscCall(MatCreateSubMatrix_MPIAIJ_nonscalable(mat, isrow, iscol_local, csize, call, newmat));
3450 
3451   if (call == MAT_INITIAL_MATRIX) {
3452     PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3453     PetscCall(ISDestroy(&iscol_local));
3454   }
3455   PetscFunctionReturn(PETSC_SUCCESS);
3456 }
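/*
  A minimal caller-side sketch of the reuse pattern served by the bookkeeping above (isrow and
  iscol are assumed to be existing parallel index sets on A's communicator):

    Mat sub;
    PetscCall(MatCreateSubMatrix(A, isrow, iscol, MAT_INITIAL_MATRIX, &sub));
    ... change A's values, keeping its nonzero pattern ...
    PetscCall(MatCreateSubMatrix(A, isrow, iscol, MAT_REUSE_MATRIX, &sub));
    PetscCall(MatDestroy(&sub));

  The index sets composed on the submatrix ("isrow_d", "SubIScol", "ISAllGather", ...) are what
  allow the MAT_REUSE_MATRIX path to skip the symbolic setup.
*/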
3457 
3458 /*@C
3459   MatCreateMPIAIJWithSeqAIJ - creates a `MATMPIAIJ` matrix using `MATSEQAIJ` matrices that contain the "diagonal"
3460   and "off-diagonal" parts of the matrix in CSR format.
3461 
3462   Collective
3463 
3464   Input Parameters:
3465 + comm   - MPI communicator
3466 . M      - the global row size
3467 . N      - the global column size
3468 . A      - "diagonal" portion of matrix
3469 . B      - "off-diagonal" portion of the matrix; if garray is `NULL`, `B` should use global column ids and have N columns; if garray is not `NULL`, `B` should use local column ids and have as many columns as entries in garray
3470 - garray - either `NULL` or the global indices of the columns of `B`. If not `NULL`, it should be allocated by `PetscMalloc1()` and will be owned by `mat` thereafter.
3471 
3472   Output Parameter:
3473 . mat - the matrix, with input `A` as its local diagonal matrix
3474 
3475   Level: advanced
3476 
3477   Notes:
3478   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix.
3479 
3480   `A` and `B` become part of the output `mat`. The user cannot use `A` and `B` anymore.
3481 
3482   If `garray` is `NULL`, `B` will be compacted to use local indices. In this sense, `B`'s sparsity pattern (nonzerostate) will be changed. If `B` is a device matrix, we need to somehow also update
3483   `B`'s copy on device.  We do so by increasing `B`'s nonzerostate. In use of `B` on device, device matrix types should detect this change (ref. internal routines `MatSeqAIJCUSPARSECopyToGPU()` or
3484   `MatAssemblyEnd_SeqAIJKokkos()`) and will just destroy and then recreate the device copy of `B`. It is not optimal, but is easy to implement and less hacky. To avoid this overhead, try to compute `garray`
3485   yourself, see algorithms in the private function `MatSetUpMultiply_MPIAIJ()`.
3486 
3487   The `NULL`-ness of `garray` does not need to be collective; in other words, `garray` can be `NULL` on some processes and non-`NULL` on others.
3488 
3489 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateMPIAIJWithSplitArrays()`
3490 @*/
3491 PetscErrorCode MatCreateMPIAIJWithSeqAIJ(MPI_Comm comm, PetscInt M, PetscInt N, Mat A, Mat B, PetscInt *garray, Mat *mat)
3492 {
3493   PetscInt    m, n;
3494   MatType     mpi_mat_type;
3495   Mat_MPIAIJ *mpiaij;
3496   Mat         C;
3497 
3498   PetscFunctionBegin;
3499   PetscCall(MatCreate(comm, &C));
3500   PetscCall(MatGetSize(A, &m, &n));
3501   PetscCheck(m == B->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Am %" PetscInt_FMT " != Bm %" PetscInt_FMT, m, B->rmap->N);
3502   PetscCheck(A->rmap->bs == B->rmap->bs, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "A row bs %" PetscInt_FMT " != B row bs %" PetscInt_FMT, A->rmap->bs, B->rmap->bs);
3503 
3504   PetscCall(MatSetSizes(C, m, n, M, N));
3505   /* Determine the type of MPI matrix that should be created from the type of matrix A, which holds the "diagonal" portion. */
3506   PetscCall(MatGetMPIMatType_Private(A, &mpi_mat_type));
3507   PetscCall(MatSetType(C, mpi_mat_type));
3508   if (!garray) {
3509     const PetscScalar *ba;
3510 
3511     B->nonzerostate++;
3512     PetscCall(MatSeqAIJGetArrayRead(B, &ba)); /* Since we will destroy B's device copy, we need to make sure the host copy is up to date */
3513     PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3514   }
3515 
3516   PetscCall(MatSetBlockSizes(C, A->rmap->bs, A->cmap->bs));
3517   PetscCall(PetscLayoutSetUp(C->rmap));
3518   PetscCall(PetscLayoutSetUp(C->cmap));
3519 
3520   mpiaij              = (Mat_MPIAIJ *)C->data;
3521   mpiaij->A           = A;
3522   mpiaij->B           = B;
3523   mpiaij->garray      = garray;
3524   C->preallocated     = PETSC_TRUE;
3525   C->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */
3526 
3527   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
3528   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3529   /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and
3530    also gets mpiaij->B compacted (if garray is NULL), with its col ids and size reduced
3531    */
3532   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3533   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
3534   PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3535   *mat = C;
3536   PetscFunctionReturn(PETSC_SUCCESS);
3537 }
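/*
  A minimal caller-side sketch (Ad and Ao are assumed to be existing MATSEQAIJ matrices on each
  process, with Ao using global column indices since garray is passed as NULL; both are consumed):

    Mat C;
    PetscCall(MatCreateMPIAIJWithSeqAIJ(PETSC_COMM_WORLD, M, N, Ad, Ao, NULL, &C));
    ... Ad and Ao now belong to C and must not be used or destroyed by the caller ...

  Supplying a precomputed garray (allocated with PetscMalloc1()) avoids the compaction of the
  off-diagonal block described in the notes above.
*/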
3538 
3539 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, PetscBool, Mat *);
3540 
3541 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowDist(Mat mat, IS isrow, IS iscol, IS iscol_local, MatReuse call, Mat *newmat)
3542 {
3543   PetscInt        i, m, n, rstart, row, rend, nz, j, bs, cbs;
3544   PetscInt       *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3545   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)mat->data;
3546   Mat             M, Msub, B = a->B;
3547   MatScalar      *aa;
3548   Mat_SeqAIJ     *aij;
3549   PetscInt       *garray = a->garray, *colsub, Ncols;
3550   PetscInt        count, Bn = B->cmap->N, cstart = mat->cmap->rstart, cend = mat->cmap->rend;
3551   IS              iscol_sub, iscmap;
3552   const PetscInt *is_idx, *cmap;
3553   PetscBool       allcolumns = PETSC_FALSE;
3554   MPI_Comm        comm;
3555 
3556   PetscFunctionBegin;
3557   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3558   if (call == MAT_REUSE_MATRIX) {
3559     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3560     PetscCheck(iscol_sub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "SubIScol passed in was not used before, cannot reuse");
3561     PetscCall(ISGetLocalSize(iscol_sub, &count));
3562 
3563     PetscCall(PetscObjectQuery((PetscObject)*newmat, "Subcmap", (PetscObject *)&iscmap));
3564     PetscCheck(iscmap, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Subcmap passed in was not used before, cannot reuse");
3565 
3566     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Msub));
3567     PetscCheck(Msub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3568 
3569     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_REUSE_MATRIX, PETSC_FALSE, &Msub));
3570 
3571   } else { /* call == MAT_INITIAL_MATRIX) */
3572     PetscBool flg;
3573 
3574     PetscCall(ISGetLocalSize(iscol, &n));
3575     PetscCall(ISGetSize(iscol, &Ncols));
3576 
3577     /* (1) iscol -> nonscalable iscol_local */
3578     /* Check for special case: each processor gets entire matrix columns */
3579     PetscCall(ISIdentity(iscol_local, &flg));
3580     if (flg && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3581     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3582     if (allcolumns) {
3583       iscol_sub = iscol_local;
3584       PetscCall(PetscObjectReference((PetscObject)iscol_local));
3585       PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iscmap));
3586 
3587     } else {
3588       /* (2) iscol_local -> iscol_sub and iscmap. Implementation below requires iscol_local to be sorted; it can have duplicate indices */
3589       PetscInt *idx, *cmap1, k;
3590       PetscCall(PetscMalloc1(Ncols, &idx));
3591       PetscCall(PetscMalloc1(Ncols, &cmap1));
3592       PetscCall(ISGetIndices(iscol_local, &is_idx));
3593       count = 0;
3594       k     = 0;
3595       for (i = 0; i < Ncols; i++) {
3596         j = is_idx[i];
3597         if (j >= cstart && j < cend) {
3598           /* diagonal part of mat */
3599           idx[count]     = j;
3600           cmap1[count++] = i; /* column index in submat */
3601         } else if (Bn) {
3602           /* off-diagonal part of mat */
3603           if (j == garray[k]) {
3604             idx[count]     = j;
3605             cmap1[count++] = i; /* column index in submat */
3606           } else if (j > garray[k]) {
3607             while (j > garray[k] && k < Bn - 1) k++;
3608             if (j == garray[k]) {
3609               idx[count]     = j;
3610               cmap1[count++] = i; /* column index in submat */
3611             }
3612           }
3613         }
3614       }
3615       PetscCall(ISRestoreIndices(iscol_local, &is_idx));
3616 
3617       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_OWN_POINTER, &iscol_sub));
3618       PetscCall(ISGetBlockSize(iscol, &cbs));
3619       PetscCall(ISSetBlockSize(iscol_sub, cbs));
3620 
3621       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)iscol_local), count, cmap1, PETSC_OWN_POINTER, &iscmap));
3622     }
3623 
3624     /* (3) Create sequential Msub */
3625     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_INITIAL_MATRIX, allcolumns, &Msub));
3626   }
3627 
3628   PetscCall(ISGetLocalSize(iscol_sub, &count));
3629   aij = (Mat_SeqAIJ *)Msub->data;
3630   ii  = aij->i;
3631   PetscCall(ISGetIndices(iscmap, &cmap));
3632 
3633   /*
3634       m - number of local rows
3635       Ncols - number of columns (same on all processors)
3636       rstart - first row in new global matrix generated
3637   */
3638   PetscCall(MatGetSize(Msub, &m, NULL));
3639 
3640   if (call == MAT_INITIAL_MATRIX) {
3641     /* (4) Create parallel newmat */
3642     PetscMPIInt rank, size;
3643     PetscInt    csize;
3644 
3645     PetscCallMPI(MPI_Comm_size(comm, &size));
3646     PetscCallMPI(MPI_Comm_rank(comm, &rank));
3647 
3648     /*
3649         Determine the number of non-zeros in the diagonal and off-diagonal
3650         portions of the matrix in order to do correct preallocation
3651     */
3652 
3653     /* first get start and end of "diagonal" columns */
3654     PetscCall(ISGetLocalSize(iscol, &csize));
3655     if (csize == PETSC_DECIDE) {
3656       PetscCall(ISGetSize(isrow, &mglobal));
3657       if (mglobal == Ncols) { /* square matrix */
3658         nlocal = m;
3659       } else {
3660         nlocal = Ncols / size + ((Ncols % size) > rank);
3661       }
3662     } else {
3663       nlocal = csize;
3664     }
3665     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3666     rstart = rend - nlocal;
3667     PetscCheck(rank != size - 1 || rend == Ncols, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, Ncols);
3668 
3669     /* next, compute all the lengths */
3670     jj = aij->j;
3671     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3672     olens = dlens + m;
3673     for (i = 0; i < m; i++) {
3674       jend = ii[i + 1] - ii[i];
3675       olen = 0;
3676       dlen = 0;
3677       for (j = 0; j < jend; j++) {
3678         if (cmap[*jj] < rstart || cmap[*jj] >= rend) olen++;
3679         else dlen++;
3680         jj++;
3681       }
3682       olens[i] = olen;
3683       dlens[i] = dlen;
3684     }
3685 
3686     PetscCall(ISGetBlockSize(isrow, &bs));
3687     PetscCall(ISGetBlockSize(iscol, &cbs));
3688 
3689     PetscCall(MatCreate(comm, &M));
3690     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, Ncols));
3691     PetscCall(MatSetBlockSizes(M, bs, cbs));
3692     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3693     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3694     PetscCall(PetscFree(dlens));
3695 
3696   } else { /* call == MAT_REUSE_MATRIX */
3697     M = *newmat;
3698     PetscCall(MatGetLocalSize(M, &i, NULL));
3699     PetscCheck(i == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3700     PetscCall(MatZeroEntries(M));
3701     /*
3702          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3703        rather than the slower MatSetValues().
3704     */
3705     M->was_assembled = PETSC_TRUE;
3706     M->assembled     = PETSC_FALSE;
3707   }
3708 
3709   /* (5) Set values of Msub to *newmat */
3710   PetscCall(PetscMalloc1(count, &colsub));
3711   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
3712 
3713   jj = aij->j;
3714   PetscCall(MatSeqAIJGetArrayRead(Msub, (const PetscScalar **)&aa));
3715   for (i = 0; i < m; i++) {
3716     row = rstart + i;
3717     nz  = ii[i + 1] - ii[i];
3718     for (j = 0; j < nz; j++) colsub[j] = cmap[jj[j]];
3719     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, colsub, aa, INSERT_VALUES));
3720     jj += nz;
3721     aa += nz;
3722   }
3723   PetscCall(MatSeqAIJRestoreArrayRead(Msub, (const PetscScalar **)&aa));
3724   PetscCall(ISRestoreIndices(iscmap, &cmap));
3725 
3726   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3727   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3728 
3729   PetscCall(PetscFree(colsub));
3730 
3731   /* save Msub, iscol_sub and iscmap used in processor for next request */
3732   if (call == MAT_INITIAL_MATRIX) {
3733     *newmat = M;
3734     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubMatrix", (PetscObject)Msub));
3735     PetscCall(MatDestroy(&Msub));
3736 
3737     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubIScol", (PetscObject)iscol_sub));
3738     PetscCall(ISDestroy(&iscol_sub));
3739 
3740     PetscCall(PetscObjectCompose((PetscObject)*newmat, "Subcmap", (PetscObject)iscmap));
3741     PetscCall(ISDestroy(&iscmap));
3742 
3743     if (iscol_local) {
3744       PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3745       PetscCall(ISDestroy(&iscol_local));
3746     }
3747   }
3748   PetscFunctionReturn(PETSC_SUCCESS);
3749 }
3750 
3751 /*
3752     Not great since it makes two copies of the submatrix: first a sequential SeqAIJ
3753   on each process, and then the final result by concatenating the local matrices.
3754   Writing it directly would be much like MatCreateSubMatrices_MPIAIJ()
3755 
3756   This requires a sequential iscol with all indices.
3757 */
3758 PetscErrorCode MatCreateSubMatrix_MPIAIJ_nonscalable(Mat mat, IS isrow, IS iscol, PetscInt csize, MatReuse call, Mat *newmat)
3759 {
3760   PetscMPIInt rank, size;
3761   PetscInt    i, m, n, rstart, row, rend, nz, *cwork, j, bs, cbs;
3762   PetscInt   *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3763   Mat         M, Mreuse;
3764   MatScalar  *aa, *vwork;
3765   MPI_Comm    comm;
3766   Mat_SeqAIJ *aij;
3767   PetscBool   colflag, allcolumns = PETSC_FALSE;
3768 
3769   PetscFunctionBegin;
3770   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3771   PetscCallMPI(MPI_Comm_rank(comm, &rank));
3772   PetscCallMPI(MPI_Comm_size(comm, &size));
3773 
3774   /* Check for special case: each processor gets entire matrix columns */
3775   PetscCall(ISIdentity(iscol, &colflag));
3776   PetscCall(ISGetLocalSize(iscol, &n));
3777   if (colflag && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3778   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3779 
3780   if (call == MAT_REUSE_MATRIX) {
3781     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Mreuse));
3782     PetscCheck(Mreuse, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3783     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_REUSE_MATRIX, allcolumns, &Mreuse));
3784   } else {
3785     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, allcolumns, &Mreuse));
3786   }
3787 
3788   /*
3789       m - number of local rows
3790       n - number of columns (same on all processors)
3791       rstart - first row in new global matrix generated
3792   */
3793   PetscCall(MatGetSize(Mreuse, &m, &n));
3794   PetscCall(MatGetBlockSizes(Mreuse, &bs, &cbs));
3795   if (call == MAT_INITIAL_MATRIX) {
3796     aij = (Mat_SeqAIJ *)Mreuse->data;
3797     ii  = aij->i;
3798     jj  = aij->j;
3799 
3800     /*
3801         Determine the number of non-zeros in the diagonal and off-diagonal
3802         portions of the matrix in order to do correct preallocation
3803     */
3804 
3805     /* first get start and end of "diagonal" columns */
3806     if (csize == PETSC_DECIDE) {
3807       PetscCall(ISGetSize(isrow, &mglobal));
3808       if (mglobal == n) { /* square matrix */
3809         nlocal = m;
3810       } else {
3811         nlocal = n / size + ((n % size) > rank);
3812       }
3813     } else {
3814       nlocal = csize;
3815     }
3816     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3817     rstart = rend - nlocal;
3818     PetscCheck(rank != size - 1 || rend == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, n);
3819 
3820     /* next, compute all the lengths */
3821     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3822     olens = dlens + m;
3823     for (i = 0; i < m; i++) {
3824       jend = ii[i + 1] - ii[i];
3825       olen = 0;
3826       dlen = 0;
3827       for (j = 0; j < jend; j++) {
3828         if (*jj < rstart || *jj >= rend) olen++;
3829         else dlen++;
3830         jj++;
3831       }
3832       olens[i] = olen;
3833       dlens[i] = dlen;
3834     }
3835     PetscCall(MatCreate(comm, &M));
3836     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, n));
3837     PetscCall(MatSetBlockSizes(M, bs, cbs));
3838     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3839     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3840     PetscCall(PetscFree(dlens));
3841   } else {
3842     PetscInt ml, nl;
3843 
3844     M = *newmat;
3845     PetscCall(MatGetLocalSize(M, &ml, &nl));
3846     PetscCheck(ml == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3847     PetscCall(MatZeroEntries(M));
3848     /*
3849          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3850        rather than the slower MatSetValues().
3851     */
3852     M->was_assembled = PETSC_TRUE;
3853     M->assembled     = PETSC_FALSE;
3854   }
3855   PetscCall(MatGetOwnershipRange(M, &rstart, &rend));
3856   aij = (Mat_SeqAIJ *)Mreuse->data;
3857   ii  = aij->i;
3858   jj  = aij->j;
3859 
3860   /* trigger copy to CPU if needed */
3861   PetscCall(MatSeqAIJGetArrayRead(Mreuse, (const PetscScalar **)&aa));
3862   for (i = 0; i < m; i++) {
3863     row   = rstart + i;
3864     nz    = ii[i + 1] - ii[i];
3865     cwork = jj;
3866     jj    = PetscSafePointerPlusOffset(jj, nz);
3867     vwork = aa;
3868     aa    = PetscSafePointerPlusOffset(aa, nz);
3869     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, cwork, vwork, INSERT_VALUES));
3870   }
3871   PetscCall(MatSeqAIJRestoreArrayRead(Mreuse, (const PetscScalar **)&aa));
3872 
3873   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3874   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3875   *newmat = M;
3876 
3877   /* save submatrix used in processor for next request */
3878   if (call == MAT_INITIAL_MATRIX) {
3879     PetscCall(PetscObjectCompose((PetscObject)M, "SubMatrix", (PetscObject)Mreuse));
3880     PetscCall(MatDestroy(&Mreuse));
3881   }
3882   PetscFunctionReturn(PETSC_SUCCESS);
3883 }
3884 
3885 static PetscErrorCode MatMPIAIJSetPreallocationCSR_MPIAIJ(Mat B, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
3886 {
3887   PetscInt        m, cstart, cend, j, nnz, i, d, *ld;
3888   PetscInt       *d_nnz, *o_nnz, nnz_max = 0, rstart, ii, irstart;
3889   const PetscInt *JJ;
3890   PetscBool       nooffprocentries;
3891   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)B->data;
3892 
3893   PetscFunctionBegin;
3894   PetscCall(PetscLayoutSetUp(B->rmap));
3895   PetscCall(PetscLayoutSetUp(B->cmap));
3896   m       = B->rmap->n;
3897   cstart  = B->cmap->rstart;
3898   cend    = B->cmap->rend;
3899   rstart  = B->rmap->rstart;
3900   irstart = Ii[0];
3901 
3902   PetscCall(PetscCalloc2(m, &d_nnz, m, &o_nnz));
3903 
3904   if (PetscDefined(USE_DEBUG)) {
3905     for (i = 0; i < m; i++) {
3906       nnz = Ii[i + 1] - Ii[i];
3907       JJ  = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3908       PetscCheck(nnz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative %" PetscInt_FMT " number of columns", i, nnz);
3909       PetscCheck(!nnz || !(JJ[0] < 0), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " starts with negative column index %" PetscInt_FMT, i, JJ[0]);
3910       PetscCheck(!nnz || !(JJ[nnz - 1] >= B->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " ends with too large a column index %" PetscInt_FMT " (max allowed %" PetscInt_FMT ")", i, JJ[nnz - 1], B->cmap->N);
3911     }
3912   }
3913 
3914   for (i = 0; i < m; i++) {
3915     nnz     = Ii[i + 1] - Ii[i];
3916     JJ      = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3917     nnz_max = PetscMax(nnz_max, nnz);
3918     d       = 0;
3919     for (j = 0; j < nnz; j++) {
3920       if (cstart <= JJ[j] && JJ[j] < cend) d++;
3921     }
3922     d_nnz[i] = d;
3923     o_nnz[i] = nnz - d;
3924   }
3925   PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
3926   PetscCall(PetscFree2(d_nnz, o_nnz));
3927 
3928   for (i = 0; i < m; i++) {
3929     ii = i + rstart;
3930     PetscCall(MatSetValues_MPIAIJ(B, 1, &ii, Ii[i + 1] - Ii[i], PetscSafePointerPlusOffset(J, Ii[i] - irstart), PetscSafePointerPlusOffset(v, Ii[i] - irstart), INSERT_VALUES));
3931   }
3932   nooffprocentries    = B->nooffprocentries;
3933   B->nooffprocentries = PETSC_TRUE;
3934   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
3935   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
3936   B->nooffprocentries = nooffprocentries;
3937 
3938   /* count number of entries below block diagonal */
3939   PetscCall(PetscFree(Aij->ld));
3940   PetscCall(PetscCalloc1(m, &ld));
3941   Aij->ld = ld;
3942   for (i = 0; i < m; i++) {
3943     nnz = Ii[i + 1] - Ii[i];
3944     j   = 0;
3945     while (j < nnz && J[j] < cstart) j++;
3946     ld[i] = j;
3947     if (J) J += nnz;
3948   }
3949 
3950   PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3951   PetscFunctionReturn(PETSC_SUCCESS);
3952 }
3953 
3954 /*@
3955   MatMPIAIJSetPreallocationCSR - Allocates memory for a sparse parallel matrix in `MATAIJ` format
3956   (the default parallel PETSc format).
3957 
3958   Collective
3959 
3960   Input Parameters:
3961 + B - the matrix
3962 . i - the indices into `j` for the start of each local row (indices start with zero)
3963 . j - the column indices for each local row (indices start with zero)
3964 - v - optional values in the matrix
3965 
3966   Level: developer
3967 
3968   Notes:
3969   The `i`, `j`, and `v` arrays ARE copied by this routine into the internal format used by PETSc;
3970   thus you CANNOT change the matrix entries by changing the values of `v` after you have
3971   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
3972 
3973   The `i` and `j` indices are 0 based, and the `i` entries are offsets into the local `j` array.
3974 
3975   A convenience routine for this functionality is `MatCreateMPIAIJWithArrays()`.
3976 
3977   You can update the matrix with new numerical values using `MatUpdateMPIAIJWithArrays()` after this call if the column indices in `j` are sorted.
3978 
3979   If you do **not** use `MatUpdateMPIAIJWithArrays()`, the column indices in `j` do not need to be sorted. If you will use
3980   `MatUpdateMPIAIJWithArrays()`, the column indices **must** be sorted.
3981 
3982   The format used for the sparse matrix input is equivalent to a
3983   row-major ordering, i.e., for the following matrix, the input data expected is
3984   as shown below
3985 .vb
3986         1 0 0
3987         2 0 3     P0
3988        -------
3989         4 5 6     P1
3990 
3991      Process0 [P0] rows_owned=[0,1]
3992         i =  {0,1,3}  [size = nrow+1  = 2+1]
3993         j =  {0,0,2}  [size = 3]
3994         v =  {1,2,3}  [size = 3]
3995 
3996      Process1 [P1] rows_owned=[2]
3997         i =  {0,3}    [size = nrow+1  = 1+1]
3998         j =  {0,1,2}  [size = 3]
3999         v =  {4,5,6}  [size = 3]
4000 .ve
4001 
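  A minimal calling sketch for process 0 of the two-process example above (variable names are
  illustrative, error checking is omitted, and the other process passes its own local arrays)
.vb
     Mat         B;
     PetscInt    i[] = {0, 1, 3};        // 2 local rows
     PetscInt    j[] = {0, 0, 2};        // global column indices
     PetscScalar v[] = {1.0, 2.0, 3.0};

     MatCreate(PETSC_COMM_WORLD, &B);
     MatSetSizes(B, 2, PETSC_DECIDE, 3, 3);
     MatSetType(B, MATMPIAIJ);
     MatMPIAIJSetPreallocationCSR(B, i, j, v);
.ve
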
4002 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatCreateAIJ()`,
4003           `MatCreateSeqAIJWithArrays()`, `MatCreateMPIAIJWithSplitArrays()`, `MatCreateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4004 @*/
4005 PetscErrorCode MatMPIAIJSetPreallocationCSR(Mat B, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
4006 {
4007   PetscFunctionBegin;
4008   PetscTryMethod(B, "MatMPIAIJSetPreallocationCSR_C", (Mat, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, i, j, v));
4009   PetscFunctionReturn(PETSC_SUCCESS);
4010 }
4011 
4012 /*@
4013   MatMPIAIJSetPreallocation - Preallocates memory for a sparse parallel matrix in `MATMPIAIJ` format
4014   (the default parallel PETSc format).  For good matrix assembly performance
4015   the user should preallocate the matrix storage by setting the parameters
4016   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4017 
4018   Collective
4019 
4020   Input Parameters:
4021 + B     - the matrix
4022 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4023            (same value is used for all local rows)
4024 . d_nnz - array containing the number of nonzeros in the various rows of the
4025            DIAGONAL portion of the local submatrix (possibly different for each row)
4026            or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `d_nz` is used to specify the nonzero structure.
4027            The size of this array is equal to the number of local rows, i.e., 'm'.
4028            For matrices that will be factored, you must leave room for (and set)
4029            the diagonal entry even if it is zero.
4030 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4031            submatrix (same value is used for all local rows).
4032 - o_nnz - array containing the number of nonzeros in the various rows of the
4033            OFF-DIAGONAL portion of the local submatrix (possibly different for
4034            each row) or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `o_nz` is used to specify the nonzero
4035            structure. The size of this array is equal to the number
4036            of local rows, i.e., 'm'.
4037 
4038   Example Usage:
4039   Consider the following 8x8 matrix with 34 non-zero values, that is
4040   assembled across 3 processors. Let's assume that proc0 owns 3 rows,
4041   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4042   as follows
4043 
4044 .vb
4045             1  2  0  |  0  3  0  |  0  4
4046     Proc0   0  5  6  |  7  0  0  |  8  0
4047             9  0 10  | 11  0  0  | 12  0
4048     -------------------------------------
4049            13  0 14  | 15 16 17  |  0  0
4050     Proc1   0 18  0  | 19 20 21  |  0  0
4051             0  0  0  | 22 23  0  | 24  0
4052     -------------------------------------
4053     Proc2  25 26 27  |  0  0 28  | 29  0
4054            30  0  0  | 31 32 33  |  0 34
4055 .ve
4056 
4057   This can be represented as a collection of submatrices as
4058 .vb
4059       A B C
4060       D E F
4061       G H I
4062 .ve
4063 
4064   Where the submatrices A,B,C are owned by proc0, D,E,F are
4065   owned by proc1, G,H,I are owned by proc2.
4066 
4067   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4068   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4069   The 'M','N' parameters are 8,8, and have the same values on all procs.
4070 
4071   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4072   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4073   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4074   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4075   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4076   matrix, and [DF] as another `MATSEQAIJ` matrix.
4077 
4078   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4079   allocated for every row of the local DIAGONAL submatrix, and `o_nz`
4080   storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
4081   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
4082   the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4083   In this case, the values of `d_nz`, `o_nz` are
4084 .vb
4085      proc0  dnz = 2, o_nz = 2
4086      proc1  dnz = 3, o_nz = 2
4087      proc2  dnz = 1, o_nz = 4
4088 .ve
4089   We are allocating `m`*(`d_nz`+`o_nz`) storage locations for every proc. This
4090   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
4091   for proc2, i.e., we are using 12+15+10=37 storage locations to store
4092   34 values.
4093 
4094   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4095   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4096   In the above case the values for `d_nnz`, `o_nnz` are
4097 .vb
4098      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4099      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4100      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4101 .ve
4102   Here the space allocated is the sum of all the above values, i.e., 34, and
4103   hence the preallocation is perfect.
4104 
4105   Level: intermediate
4106 
4107   Notes:
4108   If the *_nnz parameter is given then the *_nz parameter is ignored
4109 
4110   The `MATAIJ` format, also called compressed row storage (CSR), is compatible with standard Fortran
4111   storage.  The stored row and column indices begin with zero.
4112   See [Sparse Matrices](sec_matsparse) for details.
4113 
4114   The parallel matrix is partitioned such that the first m0 rows belong to
4115   process 0, the next m1 rows belong to process 1, the next m2 rows belong
4116   to process 2, etc., where m0,m1,m2... are the input parameter 'm'.
4117 
4118   The DIAGONAL portion of the local submatrix of a processor can be defined
4119   as the submatrix which is obtained by extracting the part corresponding to
4120   the rows r1-r2 and columns c1-c2 of the global matrix, where r1 is the
4121   first row that belongs to the processor, r2 is the last row belonging to
4122   this processor, and c1-c2 is the range of indices of the local part of a
4123   vector suitable for applying the matrix to.  This is an mxn matrix.  In the
4124   common case of a square matrix, the row and column ranges are the same and
4125   the DIAGONAL part is also square. The remaining portion of the local
4126   submatrix (m x (N-n)) constitutes the OFF-DIAGONAL portion.
4127 
4128   If `o_nnz` and `d_nnz` are specified, then `o_nz` and `d_nz` are ignored.
4129 
4130   You can call `MatGetInfo()` to get information on how effective the preallocation was;
4131   for example, the fields mallocs, nz_allocated, nz_used, and nz_unneeded.
4132   You can also run with the option `-info` and look for messages with the string
4133   malloc in them to see if additional memory allocation was needed.
4134 
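  As a minimal sketch, the setup on proc0 of the example above might look as follows (illustrative
  only; the other processes pass their own local sizes and per-row counts)
.vb
     Mat      A;
     PetscInt d_nnz[] = {2, 2, 2}, o_nnz[] = {2, 2, 2};   // per-row counts on proc0

     MatCreate(PETSC_COMM_WORLD, &A);
     MatSetSizes(A, 3, 3, 8, 8);
     MatSetType(A, MATMPIAIJ);
     MatMPIAIJSetPreallocation(A, 0, d_nnz, 0, o_nnz);
     // ... insert entries with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ...
.ve
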
4135 .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MATMPIAIJ`, `MATAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateAIJ()`, `MatMPIAIJSetPreallocationCSR()`,
4136           `MatGetInfo()`, `PetscSplitOwnership()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4137 @*/
4138 PetscErrorCode MatMPIAIJSetPreallocation(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
4139 {
4140   PetscFunctionBegin;
4141   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
4142   PetscValidType(B, 1);
4143   PetscTryMethod(B, "MatMPIAIJSetPreallocation_C", (Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[]), (B, d_nz, d_nnz, o_nz, o_nnz));
4144   PetscFunctionReturn(PETSC_SUCCESS);
4145 }
4146 
4147 /*@
4148   MatCreateMPIAIJWithArrays - creates a `MATMPIAIJ` matrix using arrays that contain the local rows in standard
4149   CSR format.
4150 
4151   Collective
4152 
4153   Input Parameters:
4154 + comm - MPI communicator
4155 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
4156 . n    - This value should be the same as the local size used in creating the
4157          x vector for the matrix-vector product $y = Ax$. (or `PETSC_DECIDE` to have
4158          calculated if `N` is given) For square matrices n is almost always `m`.
4159 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
4160 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
4161 . i    - row indices (of length m+1); that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
4162 . j    - global column indices
4163 - a    - optional matrix values
4164 
4165   Output Parameter:
4166 . mat - the matrix
4167 
4168   Level: intermediate
4169 
4170   Notes:
4171   The `i`, `j`, and `a` arrays ARE copied by this routine into the internal format used by PETSc;
4172   thus you CANNOT change the matrix entries by changing the values of `a[]` after you have
4173   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
4174 
4175   The `i` and `j` indices are 0 based, and the `i` entries are offsets into the local `j` array.
4176 
4177   Once you have created the matrix, you can update it with new numerical values using `MatUpdateMPIAIJWithArray()`.
4178 
4179   If you do **not** use `MatUpdateMPIAIJWithArray()`, the column indices in `j` do not need to be sorted. If you will use
4180   `MatUpdateMPIAIJWithArray()`, the column indices **must** be sorted.
4181 
4182   The format which is used for the sparse matrix input, is equivalent to a
4183   row-major ordering, i.e., for the following matrix, the input data expected is
4184   as shown
4185 .vb
4186         1 0 0
4187         2 0 3     P0
4188        -------
4189         4 5 6     P1
4190 
4191      Process0 [P0] rows_owned=[0,1]
4192         i =  {0,1,3}  [size = nrow+1  = 2+1]
4193         j =  {0,0,2}  [size = 3]
4194         v =  {1,2,3}  [size = 3]
4195 
4196      Process1 [P1] rows_owned=[2]
4197         i =  {0,3}    [size = nrow+1  = 1+1]
4198         j =  {0,1,2}  [size = 3]
4199         v =  {4,5,6}  [size = 3]
4200 .ve
4201 
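  A minimal sketch for process 0 of the two-process example above (illustrative only; each process
  passes its own local row count and arrays)
.vb
     Mat         A;
     PetscInt    i[] = {0, 1, 3};
     PetscInt    j[] = {0, 0, 2};
     PetscScalar v[] = {1.0, 2.0, 3.0};

     MatCreateMPIAIJWithArrays(PETSC_COMM_WORLD, 2, PETSC_DECIDE, PETSC_DETERMINE, 3, i, j, v, &A);
.ve
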
4202 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4203           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4204 @*/
4205 PetscErrorCode MatCreateMPIAIJWithArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt i[], const PetscInt j[], const PetscScalar a[], Mat *mat)
4206 {
4207   PetscFunctionBegin;
4208   PetscCheck(!i || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4209   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4210   PetscCall(MatCreate(comm, mat));
4211   PetscCall(MatSetSizes(*mat, m, n, M, N));
4212   /* PetscCall(MatSetBlockSizes(M,bs,cbs)); */
4213   PetscCall(MatSetType(*mat, MATMPIAIJ));
4214   PetscCall(MatMPIAIJSetPreallocationCSR(*mat, i, j, a));
4215   PetscFunctionReturn(PETSC_SUCCESS);
4216 }
4217 
4218 /*@
4219   MatUpdateMPIAIJWithArrays - updates a `MATMPIAIJ` matrix using arrays that contain the local rows in standard
4220   CSR format. Only the numerical values are updated; the other arrays must be identical to what was passed
4221   to `MatCreateMPIAIJWithArrays()`
4222 
4223   Deprecated: Use `MatUpdateMPIAIJWithArray()`
4224 
4225   Collective
4226 
4227   Input Parameters:
4228 + mat - the matrix
4229 . m   - number of local rows (Cannot be `PETSC_DECIDE`)
4230 . n   - This value should be the same as the local size used in creating the
4231        x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4232        calculated if N is given) For square matrices n is almost always m.
4233 . M   - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4234 . N   - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4235 . Ii  - row indices; that is Ii[0] = 0, Ii[row] = Ii[row-1] + number of elements in that row of the matrix
4236 . J   - column indices
4237 - v   - matrix values
4238 
4239   Level: deprecated
4240 
4241 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4242           `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4243 @*/
4244 PetscErrorCode MatUpdateMPIAIJWithArrays(Mat mat, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
4245 {
4246   PetscInt        nnz, i;
4247   PetscBool       nooffprocentries;
4248   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4249   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4250   PetscScalar    *ad, *ao;
4251   PetscInt        ldi, Iii, md;
4252   const PetscInt *Adi = Ad->i;
4253   PetscInt       *ld  = Aij->ld;
4254 
4255   PetscFunctionBegin;
4256   PetscCheck(Ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4257   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4258   PetscCheck(m == mat->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of rows cannot change from call to MatUpdateMPIAIJWithArrays()");
4259   PetscCheck(n == mat->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of columns cannot change from call to MatUpdateMPIAIJWithArrays()");
4260 
4261   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4262   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4263 
4264   for (i = 0; i < m; i++) {
4265     if (PetscDefined(USE_DEBUG)) {
4266       for (PetscInt j = Ii[i] + 1; j < Ii[i + 1]; ++j) {
4267         PetscCheck(J[j] >= J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is not sorted", j - Ii[i], J[j], i);
4268         PetscCheck(J[j] != J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is identical to previous entry", j - Ii[i], J[j], i);
4269       }
4270     }
4271     nnz = Ii[i + 1] - Ii[i];
4272     Iii = Ii[i];
4273     ldi = ld[i];
4274     md  = Adi[i + 1] - Adi[i];
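    /* row i of v is ordered [columns < cstart | diagonal block | columns >= cend]; copy those pieces into B, A, and B respectively */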
4275     PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4276     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4277     PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4278     ad += md;
4279     ao += nnz - md;
4280   }
4281   nooffprocentries      = mat->nooffprocentries;
4282   mat->nooffprocentries = PETSC_TRUE;
4283   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4284   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4285   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4286   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4287   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4288   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4289   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4290   mat->nooffprocentries = nooffprocentries;
4291   PetscFunctionReturn(PETSC_SUCCESS);
4292 }
4293 
4294 /*@
4295   MatUpdateMPIAIJWithArray - updates a `MATMPIAIJ` matrix using an array that contains the nonzero values
4296 
4297   Collective
4298 
4299   Input Parameters:
4300 + mat - the matrix
4301 - v   - matrix values, stored by row
4302 
4303   Level: intermediate
4304 
4305   Notes:
4306   The matrix must have been obtained with `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()`
4307 
4308   The column indices in the call to `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()` must have been sorted for this call to work correctly
4309 
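  A minimal sketch (assuming `A` was created with `MatCreateMPIAIJWithArrays()` with sorted column
  indices, and `vnew` is a hypothetical array holding the new values in the same CSR ordering)
.vb
     // vnew has the same length and row-by-row ordering as the values array used at creation time
     MatUpdateMPIAIJWithArray(A, vnew);
.ve
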
4310 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4311           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4312 @*/
4313 PetscErrorCode MatUpdateMPIAIJWithArray(Mat mat, const PetscScalar v[])
4314 {
4315   PetscInt        nnz, i, m;
4316   PetscBool       nooffprocentries;
4317   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4318   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4319   Mat_SeqAIJ     *Ao  = (Mat_SeqAIJ *)Aij->B->data;
4320   PetscScalar    *ad, *ao;
4321   const PetscInt *Adi = Ad->i, *Adj = Ao->i;
4322   PetscInt        ldi, Iii, md;
4323   PetscInt       *ld = Aij->ld;
4324 
4325   PetscFunctionBegin;
4326   m = mat->rmap->n;
4327 
4328   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4329   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4330   Iii = 0;
4331   for (i = 0; i < m; i++) {
4332     nnz = Adi[i + 1] - Adi[i] + Adj[i + 1] - Adj[i];
4333     ldi = ld[i];
4334     md  = Adi[i + 1] - Adi[i];
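    /* row i of v is ordered [columns < cstart | diagonal block | columns >= cend]; ld[i] and md give the split points */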
4335     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4336     ad += md;
4337     if (ao) {
4338       PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4339       PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4340       ao += nnz - md;
4341     }
4342     Iii += nnz;
4343   }
4344   nooffprocentries      = mat->nooffprocentries;
4345   mat->nooffprocentries = PETSC_TRUE;
4346   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4347   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4348   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4349   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4350   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4351   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4352   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4353   mat->nooffprocentries = nooffprocentries;
4354   PetscFunctionReturn(PETSC_SUCCESS);
4355 }
4356 
4357 /*@
4358   MatCreateAIJ - Creates a sparse parallel matrix in `MATAIJ` format
4359   (the default parallel PETSc format).  For good matrix assembly performance
4360   the user should preallocate the matrix storage by setting the parameters
4361   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4362 
4363   Collective
4364 
4365   Input Parameters:
4366 + comm  - MPI communicator
4367 . m     - number of local rows (or `PETSC_DECIDE` to have calculated if M is given)
4368           This value should be the same as the local size used in creating the
4369           y vector for the matrix-vector product y = Ax.
4370 . n     - This value should be the same as the local size used in creating the
4371           x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4372           calculated if N is given) For square matrices n is almost always m.
4373 . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4374 . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4375 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4376           (same value is used for all local rows)
4377 . d_nnz - array containing the number of nonzeros in the various rows of the
4378           DIAGONAL portion of the local submatrix (possibly different for each row)
4379           or `NULL`, if `d_nz` is used to specify the nonzero structure.
4380           The size of this array is equal to the number of local rows, i.e 'm'.
4381 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4382           submatrix (same value is used for all local rows).
4383 - o_nnz - array containing the number of nonzeros in the various rows of the
4384           OFF-DIAGONAL portion of the local submatrix (possibly different for
4385           each row) or `NULL`, if `o_nz` is used to specify the nonzero
4386           structure. The size of this array is equal to the number
4387           of local rows, i.e 'm'.
4388 
4389   Output Parameter:
4390 . A - the matrix
4391 
4392   Options Database Keys:
4393 + -mat_no_inode                     - Do not use inodes
4394 . -mat_inode_limit <limit>          - Sets inode limit (max limit=5)
4395 - -matmult_vecscatter_view <viewer> - View the vecscatter (i.e., communication pattern) used in `MatMult()` of sparse parallel matrices.
4396                                       See viewer types in manual of `MatView()`. Of them, ascii_matlab, draw or binary cause the `VecScatter`
4397                                       to be viewed as a matrix. Entry (i,j) is the size of message (in bytes) rank i sends to rank j in one `MatMult()` call.
4398 
4399   Level: intermediate
4400 
4401   Notes:
4402   It is recommended that one use `MatCreateFromOptions()` or the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
4403   MatXXXXSetPreallocation() paradigm instead of this routine directly.
4404   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
4405 
4406   If the *_nnz parameter is given then the *_nz parameter is ignored
4407 
4408   The `m`,`n`,`M`,`N` parameters specify the size of the matrix, and its partitioning across
4409   processors, while `d_nz`,`d_nnz`,`o_nz`,`o_nnz` parameters specify the approximate
4410   storage requirements for this matrix.
4411 
4412   If `PETSC_DECIDE` or  `PETSC_DETERMINE` is used for a particular argument on one
4413   processor then it must be used on all processors that share the object for
4414   that argument.
4415 
4416   If `m` and `n` are not `PETSC_DECIDE`, then the values determine the `PetscLayout` of the matrix and the ranges returned by
4417   `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`, and `MatGetOwnershipRangesColumn()`.
4418 
4419   The user MUST specify either the local or global matrix dimensions
4420   (possibly both).
4421 
4422   The parallel matrix is partitioned across processors such that the
4423   first `m0` rows belong to process 0, the next `m1` rows belong to
4424   process 1, the next `m2` rows belong to process 2, etc., where
4425   `m0`, `m1`, `m2`... are the input parameter `m` on each MPI process. I.e., each MPI process stores
4426   values corresponding to an [m x N] submatrix.
4427 
4428   The columns are logically partitioned with the n0 columns belonging
4429   to the 0th partition, the next n1 columns belonging to the next
4430   partition, etc., where n0,n1,n2... are the input parameter 'n'.
4431 
4432   The DIAGONAL portion of the local submatrix on any given processor
4433   is the submatrix corresponding to the rows and columns m,n
4434   corresponding to the given processor, i.e., the diagonal matrix on
4435   process 0 is [m0 x n0], diagonal matrix on process 1 is [m1 x n1]
4436   etc. The remaining portion of the local submatrix [m x (N-n)]
4437   constitutes the OFF-DIAGONAL portion. The example below better
4438   illustrates this concept. The two matrices, the DIAGONAL portion and
4439   the OFF-DIAGONAL portion are each stored as `MATSEQAIJ` matrices.
4440 
4441   For a square global matrix we define each processor's diagonal portion
4442   to be its local rows and the corresponding columns (a square submatrix);
4443   each processor's off-diagonal portion encompasses the remainder of the
4444   local matrix (a rectangular submatrix).
4445 
4446   If `o_nnz`, `d_nnz` are specified, then `o_nz` and `d_nz` are ignored.
4447 
4448   When calling this routine with a single process communicator, a matrix of
4449   type `MATSEQAIJ` is returned.  If a matrix of type `MATMPIAIJ` is desired for this
4450   type of communicator, use the construction mechanism
4451 .vb
4452   MatCreate(..., &A);
4453   MatSetType(A, MATMPIAIJ);
4454   MatSetSizes(A, m, n, M, N);
4455   MatMPIAIJSetPreallocation(A, ...);
4456 .ve
4457 
4458   By default, this format uses inodes (identical nodes) when possible.
4459   We search for consecutive rows with the same nonzero structure, thereby
4460   reusing matrix information to achieve increased efficiency.
4461 
4462   Example Usage:
4463   Consider the following 8x8 matrix with 34 non-zero values, that is
4464   assembled across 3 processors. Let's assume that proc0 owns 3 rows,
4465   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4466   as follows
4467 
4468 .vb
4469             1  2  0  |  0  3  0  |  0  4
4470     Proc0   0  5  6  |  7  0  0  |  8  0
4471             9  0 10  | 11  0  0  | 12  0
4472     -------------------------------------
4473            13  0 14  | 15 16 17  |  0  0
4474     Proc1   0 18  0  | 19 20 21  |  0  0
4475             0  0  0  | 22 23  0  | 24  0
4476     -------------------------------------
4477     Proc2  25 26 27  |  0  0 28  | 29  0
4478            30  0  0  | 31 32 33  |  0 34
4479 .ve
4480 
4481   This can be represented as a collection of submatrices as
4482 
4483 .vb
4484       A B C
4485       D E F
4486       G H I
4487 .ve
4488 
4489   Where the submatrices A,B,C are owned by proc0, D,E,F are
4490   owned by proc1, G,H,I are owned by proc2.
4491 
4492   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4493   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4494   The 'M','N' parameters are 8,8, and have the same values on all procs.
4495 
4496   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4497   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4498   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4499   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4500   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4501   matrix, and [DF] as another `MATSEQAIJ` matrix.
4502 
4503   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4504   allocated for every row of the local DIAGONAL submatrix, and `o_nz`
4505   storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
4506   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
4507   the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4508   In this case, the values of `d_nz`,`o_nz` are
4509 .vb
4510      proc0  dnz = 2, o_nz = 2
4511      proc1  dnz = 3, o_nz = 2
4512      proc2  dnz = 1, o_nz = 4
4513 .ve
4514   We are allocating m*(`d_nz`+`o_nz`) storage locations for every proc. This
4515   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
4516   for proc2, i.e., we are using 12+15+10=37 storage locations to store
4517   34 values.
4518 
4519   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4520   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4521   In the above case the values for `d_nnz`, `o_nnz` are
4522 .vb
4523      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4524      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4525      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4526 .ve
4527   Here the space allocated is the sum of all the above values, i.e., 34, and
4528   hence the preallocation is perfect.
4529 
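  For the example above, a sketch of the call on proc0 might be (illustrative only; each process
  passes its own local sizes and per-row nonzero counts)
.vb
     Mat      A;
     PetscInt d_nnz[] = {2, 2, 2}, o_nnz[] = {2, 2, 2};   // per-row counts on proc0

     MatCreateAIJ(PETSC_COMM_WORLD, 3, 3, 8, 8, 0, d_nnz, 0, o_nnz, &A);
     // insert entries with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd()
.ve
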
4530 .seealso: [](ch_matrices), `Mat`, [Sparse Matrix Creation](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4531           `MATMPIAIJ`, `MatCreateMPIAIJWithArrays()`, `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`,
4532           `MatGetOwnershipRangesColumn()`, `PetscLayout`
4533 @*/
4534 PetscErrorCode MatCreateAIJ(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
4535 {
4536   PetscMPIInt size;
4537 
4538   PetscFunctionBegin;
4539   PetscCall(MatCreate(comm, A));
4540   PetscCall(MatSetSizes(*A, m, n, M, N));
4541   PetscCallMPI(MPI_Comm_size(comm, &size));
4542   if (size > 1) {
4543     PetscCall(MatSetType(*A, MATMPIAIJ));
4544     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
4545   } else {
4546     PetscCall(MatSetType(*A, MATSEQAIJ));
4547     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
4548   }
4549   PetscFunctionReturn(PETSC_SUCCESS);
4550 }
4551 
4552 /*@C
4553   MatMPIAIJGetSeqAIJ - Returns the local pieces of this distributed matrix
4554 
4555   Not Collective
4556 
4557   Input Parameter:
4558 . A - The `MATMPIAIJ` matrix
4559 
4560   Output Parameters:
4561 + Ad     - The local diagonal block as a `MATSEQAIJ` matrix
4562 . Ao     - The local off-diagonal block as a `MATSEQAIJ` matrix
4563 - colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4564 
4565   Level: intermediate
4566 
4567   Note:
4568   The rows in `Ad` and `Ao` are in [0, Nr), where Nr is the number of local rows on this process. The columns
4569   in `Ad` are in [0, Nc), where Nc is the number of local columns. The columns in `Ao` are in [0, Nco), where Nco is
4570   the number of nonzero columns in the local off-diagonal piece of the matrix `A`. The array colmap maps these
4571   local column numbers to global column numbers in the original matrix.
4572 
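  A sketch of mapping the off-diagonal block back to global indices (illustrative; assumes `A` is an
  assembled `MATMPIAIJ`)
.vb
     Mat             Ad, Ao;
     const PetscInt *colmap, *cols;
     PetscInt        ncols, rstart;

     MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &colmap);
     MatGetOwnershipRange(A, &rstart, NULL);
     MatGetRow(Ao, 0, &ncols, &cols, NULL);
     // local row 0 of Ao is global row rstart; global column of entry k is colmap[cols[k]]
     MatRestoreRow(Ao, 0, &ncols, &cols, NULL);
.ve
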
4573 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`, `MatCreateAIJ()`, `MATSEQAIJ`
4574 @*/
4575 PetscErrorCode MatMPIAIJGetSeqAIJ(Mat A, Mat *Ad, Mat *Ao, const PetscInt *colmap[])
4576 {
4577   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
4578   PetscBool   flg;
4579 
4580   PetscFunctionBegin;
4581   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &flg));
4582   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "This function requires a MATMPIAIJ matrix as input");
4583   if (Ad) *Ad = a->A;
4584   if (Ao) *Ao = a->B;
4585   if (colmap) *colmap = a->garray;
4586   PetscFunctionReturn(PETSC_SUCCESS);
4587 }
4588 
4589 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_MPIAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
4590 {
4591   PetscInt     m, N, i, rstart, nnz, Ii;
4592   PetscInt    *indx;
4593   PetscScalar *values;
4594   MatType      rootType;
4595 
4596   PetscFunctionBegin;
4597   PetscCall(MatGetSize(inmat, &m, &N));
4598   if (scall == MAT_INITIAL_MATRIX) { /* symbolic phase */
4599     PetscInt *dnz, *onz, sum, bs, cbs;
4600 
4601     if (n == PETSC_DECIDE) PetscCall(PetscSplitOwnership(comm, &n, &N));
4602     /* Check sum(n) = N */
4603     PetscCallMPI(MPIU_Allreduce(&n, &sum, 1, MPIU_INT, MPI_SUM, comm));
4604     PetscCheck(sum == N, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Sum of local columns %" PetscInt_FMT " != global columns %" PetscInt_FMT, sum, N);
4605 
4606     PetscCallMPI(MPI_Scan(&m, &rstart, 1, MPIU_INT, MPI_SUM, comm));
4607     rstart -= m;
4608 
4609     MatPreallocateBegin(comm, m, n, dnz, onz);
4610     for (i = 0; i < m; i++) {
4611       PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4612       PetscCall(MatPreallocateSet(i + rstart, nnz, indx, dnz, onz));
4613       PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4614     }
4615 
4616     PetscCall(MatCreate(comm, outmat));
4617     PetscCall(MatSetSizes(*outmat, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
4618     PetscCall(MatGetBlockSizes(inmat, &bs, &cbs));
4619     PetscCall(MatSetBlockSizes(*outmat, bs, cbs));
4620     PetscCall(MatGetRootType_Private(inmat, &rootType));
4621     PetscCall(MatSetType(*outmat, rootType));
4622     PetscCall(MatSeqAIJSetPreallocation(*outmat, 0, dnz));
4623     PetscCall(MatMPIAIJSetPreallocation(*outmat, 0, dnz, 0, onz));
4624     MatPreallocateEnd(dnz, onz);
4625     PetscCall(MatSetOption(*outmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
4626   }
4627 
4628   /* numeric phase */
4629   PetscCall(MatGetOwnershipRange(*outmat, &rstart, NULL));
4630   for (i = 0; i < m; i++) {
4631     PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4632     Ii = i + rstart;
4633     PetscCall(MatSetValues(*outmat, 1, &Ii, nnz, indx, values, INSERT_VALUES));
4634     PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4635   }
4636   PetscCall(MatAssemblyBegin(*outmat, MAT_FINAL_ASSEMBLY));
4637   PetscCall(MatAssemblyEnd(*outmat, MAT_FINAL_ASSEMBLY));
4638   PetscFunctionReturn(PETSC_SUCCESS);
4639 }
4640 
4641 static PetscErrorCode MatDestroy_MPIAIJ_SeqsToMPI(void **data)
4642 {
4643   Mat_Merge_SeqsToMPI *merge = (Mat_Merge_SeqsToMPI *)*data;
4644 
4645   PetscFunctionBegin;
4646   if (!merge) PetscFunctionReturn(PETSC_SUCCESS);
4647   PetscCall(PetscFree(merge->id_r));
4648   PetscCall(PetscFree(merge->len_s));
4649   PetscCall(PetscFree(merge->len_r));
4650   PetscCall(PetscFree(merge->bi));
4651   PetscCall(PetscFree(merge->bj));
4652   PetscCall(PetscFree(merge->buf_ri[0]));
4653   PetscCall(PetscFree(merge->buf_ri));
4654   PetscCall(PetscFree(merge->buf_rj[0]));
4655   PetscCall(PetscFree(merge->buf_rj));
4656   PetscCall(PetscFree(merge->coi));
4657   PetscCall(PetscFree(merge->coj));
4658   PetscCall(PetscFree(merge->owners_co));
4659   PetscCall(PetscLayoutDestroy(&merge->rowmap));
4660   PetscCall(PetscFree(merge));
4661   PetscFunctionReturn(PETSC_SUCCESS);
4662 }
4663 
4664 #include <../src/mat/utils/freespace.h>
4665 #include <petscbt.h>
4666 
4667 PetscErrorCode MatCreateMPIAIJSumSeqAIJNumeric(Mat seqmat, Mat mpimat)
4668 {
4669   MPI_Comm             comm;
4670   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
4671   PetscMPIInt          size, rank, taga, *len_s;
4672   PetscInt             N = mpimat->cmap->N, i, j, *owners, *ai = a->i, *aj, m;
4673   PetscMPIInt          proc, k;
4674   PetscInt           **buf_ri, **buf_rj;
4675   PetscInt             anzi, *bj_i, *bi, *bj, arow, bnzi, nextaj;
4676   PetscInt             nrows, **buf_ri_k, **nextrow, **nextai;
4677   MPI_Request         *s_waits, *r_waits;
4678   MPI_Status          *status;
4679   const MatScalar     *aa, *a_a;
4680   MatScalar          **abuf_r, *ba_i;
4681   Mat_Merge_SeqsToMPI *merge;
4682   PetscContainer       container;
4683 
4684   PetscFunctionBegin;
4685   PetscCall(PetscObjectGetComm((PetscObject)mpimat, &comm));
4686   PetscCall(PetscLogEventBegin(MAT_Seqstompinum, seqmat, 0, 0, 0));
4687 
4688   PetscCallMPI(MPI_Comm_size(comm, &size));
4689   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4690 
4691   PetscCall(PetscObjectQuery((PetscObject)mpimat, "MatMergeSeqsToMPI", (PetscObject *)&container));
4692   PetscCheck(container, PetscObjectComm((PetscObject)mpimat), PETSC_ERR_PLIB, "Mat not created from MatCreateMPIAIJSumSeqAIJSymbolic");
4693   PetscCall(PetscContainerGetPointer(container, (void **)&merge));
4694   PetscCall(MatSeqAIJGetArrayRead(seqmat, &a_a));
4695   aa = a_a;
4696 
4697   bi     = merge->bi;
4698   bj     = merge->bj;
4699   buf_ri = merge->buf_ri;
4700   buf_rj = merge->buf_rj;
4701 
4702   PetscCall(PetscMalloc1(size, &status));
4703   owners = merge->rowmap->range;
4704   len_s  = merge->len_s;
4705 
4706   /* send and recv matrix values */
4707   PetscCall(PetscObjectGetNewTag((PetscObject)mpimat, &taga));
4708   PetscCall(PetscPostIrecvScalar(comm, taga, merge->nrecv, merge->id_r, merge->len_r, &abuf_r, &r_waits));
4709 
4710   PetscCall(PetscMalloc1(merge->nsend + 1, &s_waits));
4711   for (proc = 0, k = 0; proc < size; proc++) {
4712     if (!len_s[proc]) continue;
4713     i = owners[proc];
4714     PetscCallMPI(MPIU_Isend(aa + ai[i], len_s[proc], MPIU_MATSCALAR, proc, taga, comm, s_waits + k));
4715     k++;
4716   }
4717 
4718   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, r_waits, status));
4719   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, s_waits, status));
4720   PetscCall(PetscFree(status));
4721 
4722   PetscCall(PetscFree(s_waits));
4723   PetscCall(PetscFree(r_waits));
4724 
4725   /* insert mat values of mpimat */
4726   PetscCall(PetscMalloc1(N, &ba_i));
4727   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4728 
4729   for (k = 0; k < merge->nrecv; k++) {
4730     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4731     nrows       = *buf_ri_k[k];
4732     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4733     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4734   }
4735 
4736   /* set values of ba */
4737   m = merge->rowmap->n;
4738   for (i = 0; i < m; i++) {
4739     arow = owners[rank] + i;
4740     bj_i = bj + bi[i]; /* col indices of the i-th row of mpimat */
4741     bnzi = bi[i + 1] - bi[i];
4742     PetscCall(PetscArrayzero(ba_i, bnzi));
4743 
4744     /* add local non-zero vals of this proc's seqmat into ba */
4745     anzi   = ai[arow + 1] - ai[arow];
4746     aj     = a->j + ai[arow];
4747     aa     = a_a + ai[arow];
4748     nextaj = 0;
4749     for (j = 0; nextaj < anzi; j++) {
4750       if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4751         ba_i[j] += aa[nextaj++];
4752       }
4753     }
4754 
4755     /* add received vals into ba */
4756     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
4757       /* i-th row */
4758       if (i == *nextrow[k]) {
4759         anzi   = *(nextai[k] + 1) - *nextai[k];
4760         aj     = buf_rj[k] + *nextai[k];
4761         aa     = abuf_r[k] + *nextai[k];
4762         nextaj = 0;
4763         for (j = 0; nextaj < anzi; j++) {
4764           if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4765             ba_i[j] += aa[nextaj++];
4766           }
4767         }
4768         nextrow[k]++;
4769         nextai[k]++;
4770       }
4771     }
4772     PetscCall(MatSetValues(mpimat, 1, &arow, bnzi, bj_i, ba_i, INSERT_VALUES));
4773   }
4774   PetscCall(MatSeqAIJRestoreArrayRead(seqmat, &a_a));
4775   PetscCall(MatAssemblyBegin(mpimat, MAT_FINAL_ASSEMBLY));
4776   PetscCall(MatAssemblyEnd(mpimat, MAT_FINAL_ASSEMBLY));
4777 
4778   PetscCall(PetscFree(abuf_r[0]));
4779   PetscCall(PetscFree(abuf_r));
4780   PetscCall(PetscFree(ba_i));
4781   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
4782   PetscCall(PetscLogEventEnd(MAT_Seqstompinum, seqmat, 0, 0, 0));
4783   PetscFunctionReturn(PETSC_SUCCESS);
4784 }
4785 
4786 PetscErrorCode MatCreateMPIAIJSumSeqAIJSymbolic(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, Mat *mpimat)
4787 {
4788   Mat                  B_mpi;
4789   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
4790   PetscMPIInt          size, rank, tagi, tagj, *len_s, *len_si, *len_ri;
4791   PetscInt           **buf_rj, **buf_ri, **buf_ri_k;
4792   PetscInt             M = seqmat->rmap->n, N = seqmat->cmap->n, i, *owners, *ai = a->i, *aj = a->j;
4793   PetscInt             len, *dnz, *onz, bs, cbs;
4794   PetscInt             k, anzi, *bi, *bj, *lnk, nlnk, arow, bnzi;
4795   PetscInt             nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextai;
4796   MPI_Request         *si_waits, *sj_waits, *ri_waits, *rj_waits;
4797   MPI_Status          *status;
4798   PetscFreeSpaceList   free_space = NULL, current_space = NULL;
4799   PetscBT              lnkbt;
4800   Mat_Merge_SeqsToMPI *merge;
4801   PetscContainer       container;
4802 
4803   PetscFunctionBegin;
4804   PetscCall(PetscLogEventBegin(MAT_Seqstompisym, seqmat, 0, 0, 0));
4805 
4806   /* make sure it is a PETSc comm */
4807   PetscCall(PetscCommDuplicate(comm, &comm, NULL));
4808   PetscCallMPI(MPI_Comm_size(comm, &size));
4809   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4810 
4811   PetscCall(PetscNew(&merge));
4812   PetscCall(PetscMalloc1(size, &status));
4813 
4814   /* determine row ownership */
4815   PetscCall(PetscLayoutCreate(comm, &merge->rowmap));
4816   PetscCall(PetscLayoutSetLocalSize(merge->rowmap, m));
4817   PetscCall(PetscLayoutSetSize(merge->rowmap, M));
4818   PetscCall(PetscLayoutSetBlockSize(merge->rowmap, 1));
4819   PetscCall(PetscLayoutSetUp(merge->rowmap));
4820   PetscCall(PetscMalloc1(size, &len_si));
4821   PetscCall(PetscMalloc1(size, &merge->len_s));
4822 
4823   m      = merge->rowmap->n;
4824   owners = merge->rowmap->range;
4825 
4826   /* determine the number of messages to send, their lengths */
4827   len_s = merge->len_s;
4828 
4829   len          = 0; /* length of buf_si[] */
4830   merge->nsend = 0;
4831   for (PetscMPIInt proc = 0; proc < size; proc++) {
4832     len_si[proc] = 0;
4833     if (proc == rank) {
4834       len_s[proc] = 0;
4835     } else {
4836       PetscCall(PetscMPIIntCast(owners[proc + 1] - owners[proc] + 1, &len_si[proc]));
4837       PetscCall(PetscMPIIntCast(ai[owners[proc + 1]] - ai[owners[proc]], &len_s[proc])); /* num of rows to be sent to [proc] */
4838     }
4839     if (len_s[proc]) {
4840       merge->nsend++;
4841       nrows = 0;
4842       for (i = owners[proc]; i < owners[proc + 1]; i++) {
4843         if (ai[i + 1] > ai[i]) nrows++;
4844       }
4845       PetscCall(PetscMPIIntCast(2 * (nrows + 1), &len_si[proc]));
4846       len += len_si[proc];
4847     }
4848   }
4849 
4850   /* determine the number and length of messages to receive for ij-structure */
4851   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &merge->nrecv));
4852   PetscCall(PetscGatherMessageLengths2(comm, merge->nsend, merge->nrecv, len_s, len_si, &merge->id_r, &merge->len_r, &len_ri));
4853 
4854   /* post the Irecv of j-structure */
4855   PetscCall(PetscCommGetNewTag(comm, &tagj));
4856   PetscCall(PetscPostIrecvInt(comm, tagj, merge->nrecv, merge->id_r, merge->len_r, &buf_rj, &rj_waits));
4857 
4858   /* post the Isend of j-structure */
4859   PetscCall(PetscMalloc2(merge->nsend, &si_waits, merge->nsend, &sj_waits));
4860 
4861   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4862     if (!len_s[proc]) continue;
4863     i = owners[proc];
4864     PetscCallMPI(MPIU_Isend(aj + ai[i], len_s[proc], MPIU_INT, proc, tagj, comm, sj_waits + k));
4865     k++;
4866   }
4867 
4868   /* receives and sends of j-structure are complete */
4869   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, rj_waits, status));
4870   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, sj_waits, status));
4871 
4872   /* send and recv i-structure */
4873   PetscCall(PetscCommGetNewTag(comm, &tagi));
4874   PetscCall(PetscPostIrecvInt(comm, tagi, merge->nrecv, merge->id_r, len_ri, &buf_ri, &ri_waits));
4875 
4876   PetscCall(PetscMalloc1(len + 1, &buf_s));
4877   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
4878   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4879     if (!len_s[proc]) continue;
4880     /* form outgoing message for i-structure:
4881          buf_si[0]:                 nrows to be sent
4882                [1:nrows]:           row index (global)
4883                [nrows+1:2*nrows+1]: i-structure index
4884     */
4885     nrows       = len_si[proc] / 2 - 1;
4886     buf_si_i    = buf_si + nrows + 1;
4887     buf_si[0]   = nrows;
4888     buf_si_i[0] = 0;
4889     nrows       = 0;
4890     for (i = owners[proc]; i < owners[proc + 1]; i++) {
4891       anzi = ai[i + 1] - ai[i];
4892       if (anzi) {
4893         buf_si_i[nrows + 1] = buf_si_i[nrows] + anzi; /* i-structure */
4894         buf_si[nrows + 1]   = i - owners[proc];       /* local row index */
4895         nrows++;
4896       }
4897     }
4898     PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, si_waits + k));
4899     k++;
4900     buf_si += len_si[proc];
4901   }
4902 
4903   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, ri_waits, status));
4904   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, si_waits, status));
4905 
4906   PetscCall(PetscInfo(seqmat, "nsend: %d, nrecv: %d\n", merge->nsend, merge->nrecv));
4907   for (i = 0; i < merge->nrecv; i++) PetscCall(PetscInfo(seqmat, "recv len_ri=%d, len_rj=%d from [%d]\n", len_ri[i], merge->len_r[i], merge->id_r[i]));
4908 
4909   PetscCall(PetscFree(len_si));
4910   PetscCall(PetscFree(len_ri));
4911   PetscCall(PetscFree(rj_waits));
4912   PetscCall(PetscFree2(si_waits, sj_waits));
4913   PetscCall(PetscFree(ri_waits));
4914   PetscCall(PetscFree(buf_s));
4915   PetscCall(PetscFree(status));
4916 
4917   /* compute a local seq matrix in each processor */
4918   /* allocate bi array and free space for accumulating nonzero column info */
4919   PetscCall(PetscMalloc1(m + 1, &bi));
4920   bi[0] = 0;
4921 
4922   /* create and initialize a linked list */
4923   nlnk = N + 1;
4924   PetscCall(PetscLLCreate(N, N, nlnk, lnk, lnkbt));
4925 
4926   /* initial FreeSpace size is 2*(num of local nnz(seqmat)) */
4927   len = ai[owners[rank + 1]] - ai[owners[rank]];
4928   PetscCall(PetscFreeSpaceGet(PetscIntMultTruncate(2, len) + 1, &free_space));
4929 
4930   current_space = free_space;
4931 
4932   /* determine symbolic info for each local row */
4933   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4934 
4935   for (k = 0; k < merge->nrecv; k++) {
4936     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4937     nrows       = *buf_ri_k[k];
4938     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4939     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4940   }
4941 
4942   MatPreallocateBegin(comm, m, n, dnz, onz);
4943   len = 0;
4944   for (i = 0; i < m; i++) {
4945     bnzi = 0;
4946     /* add local non-zero cols of this proc's seqmat into lnk */
4947     arow = owners[rank] + i;
4948     anzi = ai[arow + 1] - ai[arow];
4949     aj   = a->j + ai[arow];
4950     PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
4951     bnzi += nlnk;
4952     /* add received col data into lnk */
4953     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
4954       if (i == *nextrow[k]) {            /* i-th row */
4955         anzi = *(nextai[k] + 1) - *nextai[k];
4956         aj   = buf_rj[k] + *nextai[k];
4957         PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
4958         bnzi += nlnk;
4959         nextrow[k]++;
4960         nextai[k]++;
4961       }
4962     }
4963     if (len < bnzi) len = bnzi; /* =max(bnzi) */
4964 
4965     /* if free space is not available, make more free space */
4966     if (current_space->local_remaining < bnzi) PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(bnzi, current_space->total_array_size), &current_space));
4967     /* copy data into free space, then initialize lnk */
4968     PetscCall(PetscLLClean(N, N, bnzi, lnk, current_space->array, lnkbt));
4969     PetscCall(MatPreallocateSet(i + owners[rank], bnzi, current_space->array, dnz, onz));
4970 
4971     current_space->array += bnzi;
4972     current_space->local_used += bnzi;
4973     current_space->local_remaining -= bnzi;
4974 
4975     bi[i + 1] = bi[i] + bnzi;
4976   }
4977 
4978   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
4979 
4980   PetscCall(PetscMalloc1(bi[m] + 1, &bj));
4981   PetscCall(PetscFreeSpaceContiguous(&free_space, bj));
4982   PetscCall(PetscLLDestroy(lnk, lnkbt));
4983 
4984   /* create symbolic parallel matrix B_mpi */
4985   PetscCall(MatGetBlockSizes(seqmat, &bs, &cbs));
4986   PetscCall(MatCreate(comm, &B_mpi));
4987   if (n == PETSC_DECIDE) {
4988     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, N));
4989   } else {
4990     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
4991   }
4992   PetscCall(MatSetBlockSizes(B_mpi, bs, cbs));
4993   PetscCall(MatSetType(B_mpi, MATMPIAIJ));
4994   PetscCall(MatMPIAIJSetPreallocation(B_mpi, 0, dnz, 0, onz));
4995   MatPreallocateEnd(dnz, onz);
4996   PetscCall(MatSetOption(B_mpi, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
4997 
4998   /* B_mpi is not ready for use - assembly will be done by MatCreateMPIAIJSumSeqAIJNumeric() */
4999   B_mpi->assembled = PETSC_FALSE;
5000   merge->bi        = bi;
5001   merge->bj        = bj;
5002   merge->buf_ri    = buf_ri;
5003   merge->buf_rj    = buf_rj;
5004   merge->coi       = NULL;
5005   merge->coj       = NULL;
5006   merge->owners_co = NULL;
5007 
5008   PetscCall(PetscCommDestroy(&comm));
5009 
5010   /* attach the supporting struct to B_mpi for reuse */
5011   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
5012   PetscCall(PetscContainerSetPointer(container, merge));
5013   PetscCall(PetscContainerSetCtxDestroy(container, MatDestroy_MPIAIJ_SeqsToMPI));
5014   PetscCall(PetscObjectCompose((PetscObject)B_mpi, "MatMergeSeqsToMPI", (PetscObject)container));
5015   PetscCall(PetscContainerDestroy(&container));
5016   *mpimat = B_mpi;
5017 
5018   PetscCall(PetscLogEventEnd(MAT_Seqstompisym, seqmat, 0, 0, 0));
5019   PetscFunctionReturn(PETSC_SUCCESS);
5020 }
5021 
5022 /*@
5023   MatCreateMPIAIJSumSeqAIJ - Creates a `MATMPIAIJ` matrix by adding sequential
5024   matrices from each processor
5025 
5026   Collective
5027 
5028   Input Parameters:
5029 + comm   - the communicator the parallel matrix will live on
5030 . seqmat - the input sequential matrix on each MPI process
5031 . m      - number of local rows (or `PETSC_DECIDE`)
5032 . n      - number of local columns (or `PETSC_DECIDE`)
5033 - scall  - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5034 
5035   Output Parameter:
5036 . mpimat - the parallel matrix generated
5037 
5038   Level: advanced
5039 
5040   Note:
5041   The dimensions of the sequential matrix on each process MUST be the same.
5042   The input `seqmat` is included in the container "Mat_Merge_SeqsToMPI", and will be
5043   destroyed when `mpimat` is destroyed. Call `PetscObjectQuery()` to access `seqmat`.
5044 
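  Example Usage:
  A minimal sketch, not taken from the PETSc examples; it assumes each rank already holds a `MATSEQAIJ` matrix `seqmat` of identical global size whose values (but not nonzero pattern) may change between calls.
.vb
  Mat mpimat;

  PetscCall(MatCreateMPIAIJSumSeqAIJ(PETSC_COMM_WORLD, seqmat, PETSC_DECIDE, PETSC_DECIDE, MAT_INITIAL_MATRIX, &mpimat));
  /* ... update the values of seqmat, keeping its nonzero pattern ... */
  PetscCall(MatCreateMPIAIJSumSeqAIJ(PETSC_COMM_WORLD, seqmat, PETSC_DECIDE, PETSC_DECIDE, MAT_REUSE_MATRIX, &mpimat));
  PetscCall(MatDestroy(&mpimat));
.ve
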
5045 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJ()`
5046 @*/
5047 PetscErrorCode MatCreateMPIAIJSumSeqAIJ(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, MatReuse scall, Mat *mpimat)
5048 {
5049   PetscMPIInt size;
5050 
5051   PetscFunctionBegin;
5052   PetscCallMPI(MPI_Comm_size(comm, &size));
5053   if (size == 1) {
5054     PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5055     if (scall == MAT_INITIAL_MATRIX) {
5056       PetscCall(MatDuplicate(seqmat, MAT_COPY_VALUES, mpimat));
5057     } else {
5058       PetscCall(MatCopy(seqmat, *mpimat, SAME_NONZERO_PATTERN));
5059     }
5060     PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5061     PetscFunctionReturn(PETSC_SUCCESS);
5062   }
5063   PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5064   if (scall == MAT_INITIAL_MATRIX) PetscCall(MatCreateMPIAIJSumSeqAIJSymbolic(comm, seqmat, m, n, mpimat));
5065   PetscCall(MatCreateMPIAIJSumSeqAIJNumeric(seqmat, *mpimat));
5066   PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5067   PetscFunctionReturn(PETSC_SUCCESS);
5068 }
5069 
5070 /*@
5071   MatAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATAIJ` matrix.
5072 
5073   Not Collective
5074 
5075   Input Parameter:
5076 . A - the matrix
5077 
5078   Output Parameter:
5079 . A_loc - the local sequential matrix generated
5080 
5081   Level: developer
5082 
5083   Notes:
5084   The matrix is created by taking `A`'s local rows and putting them into a sequential matrix
5085   with `mlocal` rows and `n` columns, where `mlocal` is obtained with `MatGetLocalSize()` and
5086   `n` is the global column count obtained with `MatGetSize()`.
5087 
5088   In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5089 
5090   For parallel matrices this creates an entirely new matrix. If the matrix is sequential it merely increases the reference count.
5091 
5092   Destroy the matrix with `MatDestroy()`
5093 
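  Example Usage:
  A small sketch of an assumed calling pattern (not from the PETSc examples); `A` is any assembled AIJ matrix:
.vb
  Mat A_loc;

  PetscCall(MatAIJGetLocalMat(A, &A_loc));
  /* inspect the local rows of A as a sequential matrix */
  PetscCall(MatView(A_loc, PETSC_VIEWER_STDOUT_SELF));
  PetscCall(MatDestroy(&A_loc));
.ve
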
5094 .seealso: [](ch_matrices), `Mat`, `MatMPIAIJGetLocalMat()`
5095 @*/
5096 PetscErrorCode MatAIJGetLocalMat(Mat A, Mat *A_loc)
5097 {
5098   PetscBool mpi;
5099 
5100   PetscFunctionBegin;
5101   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &mpi));
5102   if (mpi) {
5103     PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, A_loc));
5104   } else {
5105     *A_loc = A;
5106     PetscCall(PetscObjectReference((PetscObject)*A_loc));
5107   }
5108   PetscFunctionReturn(PETSC_SUCCESS);
5109 }
5110 
5111 /*@
5112   MatMPIAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix.
5113 
5114   Not Collective
5115 
5116   Input Parameters:
5117 + A     - the matrix
5118 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5119 
5120   Output Parameter:
5121 . A_loc - the local sequential matrix generated
5122 
5123   Level: developer
5124 
5125   Notes:
5126   The matrix is created by taking all of `A`'s local rows and putting them into a sequential
5127   matrix with `mlocal` rows and `n` columns. `mlocal` is the row count obtained with
5128   `MatGetLocalSize()` and `n` is the global column count obtained with `MatGetSize()`.
5129 
5130   In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5131 
5132   When `A` is sequential and `MAT_INITIAL_MATRIX` is requested, the matrix returned is the diagonal part of `A` (which contains the entire matrix),
5133   with its reference count increased by one. Hence changing values of `A_loc` changes `A`. If `MAT_REUSE_MATRIX` is requested on a sequential matrix
5134   then `MatCopy`(Adiag,*`A_loc`,`SAME_NONZERO_PATTERN`) is called to fill `A_loc`. Thus one can preallocate the appropriate sequential matrix `A_loc`
5135   and then call this routine with `MAT_REUSE_MATRIX`. In this case, one can modify the values of `A_loc` without affecting the original sequential matrix.
5136 
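  Example Usage:
  A sketch of the initial/reuse calling sequence (it assumes the nonzero pattern of `A` does not change between the two calls):
.vb
  Mat A_loc;

  PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &A_loc));
  /* ... the values of A change, its nonzero pattern does not ... */
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &A_loc));
  PetscCall(MatDestroy(&A_loc));
.ve
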
5137 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMatCondensed()`, `MatMPIAIJGetLocalMatMerge()`
5138 @*/
5139 PetscErrorCode MatMPIAIJGetLocalMat(Mat A, MatReuse scall, Mat *A_loc)
5140 {
5141   Mat_MPIAIJ        *mpimat = (Mat_MPIAIJ *)A->data;
5142   Mat_SeqAIJ        *mat, *a, *b;
5143   PetscInt          *ai, *aj, *bi, *bj, *cmap = mpimat->garray;
5144   const PetscScalar *aa, *ba, *aav, *bav;
5145   PetscScalar       *ca, *cam;
5146   PetscMPIInt        size;
5147   PetscInt           am = A->rmap->n, i, j, k, cstart = A->cmap->rstart;
5148   PetscInt          *ci, *cj, col, ncols_d, ncols_o, jo;
5149   PetscBool          match;
5150 
5151   PetscFunctionBegin;
5152   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &match));
5153   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5154   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5155   if (size == 1) {
5156     if (scall == MAT_INITIAL_MATRIX) {
5157       PetscCall(PetscObjectReference((PetscObject)mpimat->A));
5158       *A_loc = mpimat->A;
5159     } else if (scall == MAT_REUSE_MATRIX) {
5160       PetscCall(MatCopy(mpimat->A, *A_loc, SAME_NONZERO_PATTERN));
5161     }
5162     PetscFunctionReturn(PETSC_SUCCESS);
5163   }
5164 
5165   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5166   a  = (Mat_SeqAIJ *)mpimat->A->data;
5167   b  = (Mat_SeqAIJ *)mpimat->B->data;
5168   ai = a->i;
5169   aj = a->j;
5170   bi = b->i;
5171   bj = b->j;
5172   PetscCall(MatSeqAIJGetArrayRead(mpimat->A, &aav));
5173   PetscCall(MatSeqAIJGetArrayRead(mpimat->B, &bav));
5174   aa = aav;
5175   ba = bav;
5176   if (scall == MAT_INITIAL_MATRIX) {
5177     PetscCall(PetscMalloc1(1 + am, &ci));
5178     ci[0] = 0;
5179     for (i = 0; i < am; i++) ci[i + 1] = ci[i] + (ai[i + 1] - ai[i]) + (bi[i + 1] - bi[i]);
5180     PetscCall(PetscMalloc1(1 + ci[am], &cj));
5181     PetscCall(PetscMalloc1(1 + ci[am], &ca));
5182     k = 0;
5183     for (i = 0; i < am; i++) {
5184       ncols_o = bi[i + 1] - bi[i];
5185       ncols_d = ai[i + 1] - ai[i];
5186       /* off-diagonal portion of A */
5187       for (jo = 0; jo < ncols_o; jo++) {
5188         col = cmap[*bj];
5189         if (col >= cstart) break;
5190         cj[k] = col;
5191         bj++;
5192         ca[k++] = *ba++;
5193       }
5194       /* diagonal portion of A */
5195       for (j = 0; j < ncols_d; j++) {
5196         cj[k]   = cstart + *aj++;
5197         ca[k++] = *aa++;
5198       }
5199       /* off-diagonal portion of A */
5200       for (j = jo; j < ncols_o; j++) {
5201         cj[k]   = cmap[*bj++];
5202         ca[k++] = *ba++;
5203       }
5204     }
5205     /* put together the new matrix */
5206     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, A->cmap->N, ci, cj, ca, A_loc));
5207     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5208     /* Since these are PETSc arrays, change flags to free them as necessary. */
5209     mat          = (Mat_SeqAIJ *)(*A_loc)->data;
5210     mat->free_a  = PETSC_TRUE;
5211     mat->free_ij = PETSC_TRUE;
5212     mat->nonew   = 0;
5213   } else if (scall == MAT_REUSE_MATRIX) {
5214     mat = (Mat_SeqAIJ *)(*A_loc)->data;
5215     ci  = mat->i;
5216     cj  = mat->j;
5217     PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &cam));
5218     for (i = 0; i < am; i++) {
5219       /* off-diagonal portion of A */
5220       ncols_o = bi[i + 1] - bi[i];
5221       for (jo = 0; jo < ncols_o; jo++) {
5222         col = cmap[*bj];
5223         if (col >= cstart) break;
5224         *cam++ = *ba++;
5225         bj++;
5226       }
5227       /* diagonal portion of A */
5228       ncols_d = ai[i + 1] - ai[i];
5229       for (j = 0; j < ncols_d; j++) *cam++ = *aa++;
5230       /* off-diagonal portion of A */
5231       for (j = jo; j < ncols_o; j++) {
5232         *cam++ = *ba++;
5233         bj++;
5234       }
5235     }
5236     PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &cam));
5237   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5238   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->A, &aav));
5239   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->B, &bav));
5240   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5241   PetscFunctionReturn(PETSC_SUCCESS);
5242 }
5243 
5244 /*@
5245   MatMPIAIJGetLocalMatMerge - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix by taking all its local rows and putting them into a sequential matrix with
5246   mlocal rows and n columns, where n is the sum of the numbers of columns of the diagonal and off-diagonal parts
5247 
5248   Not Collective
5249 
5250   Input Parameters:
5251 + A     - the matrix
5252 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5253 
5254   Output Parameters:
5255 + glob  - sequential `IS` with global indices associated with the columns of the local sequential matrix generated (can be `NULL`)
5256 - A_loc - the local sequential matrix generated
5257 
5258   Level: developer
5259 
5260   Note:
5261   This is different from `MatMPIAIJGetLocalMat()` since the first columns of the returned matrix are those associated with the diagonal
5262   part, followed by those associated with the off-diagonal part (in its local ordering)
5263 
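  Example Usage:
  A sketch (it assumes `A` is an assembled `MATMPIAIJ`); the `IS` returned in `glob` maps the local column ordering of `A_loc` back to global columns of `A`:
.vb
  Mat A_loc;
  IS  glob;

  PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &A_loc));
  /* column j of A_loc corresponds to the global column given by the j-th entry of glob */
  PetscCall(ISDestroy(&glob));
  PetscCall(MatDestroy(&A_loc));
.ve
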
5264 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`
5265 @*/
5266 PetscErrorCode MatMPIAIJGetLocalMatMerge(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
5267 {
5268   Mat             Ao, Ad;
5269   const PetscInt *cmap;
5270   PetscMPIInt     size;
5271   PetscErrorCode (*f)(Mat, MatReuse, IS *, Mat *);
5272 
5273   PetscFunctionBegin;
5274   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
5275   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5276   if (size == 1) {
5277     if (scall == MAT_INITIAL_MATRIX) {
5278       PetscCall(PetscObjectReference((PetscObject)Ad));
5279       *A_loc = Ad;
5280     } else if (scall == MAT_REUSE_MATRIX) {
5281       PetscCall(MatCopy(Ad, *A_loc, SAME_NONZERO_PATTERN));
5282     }
5283     if (glob) PetscCall(ISCreateStride(PetscObjectComm((PetscObject)Ad), Ad->cmap->n, Ad->cmap->rstart, 1, glob));
5284     PetscFunctionReturn(PETSC_SUCCESS);
5285   }
5286   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", &f));
5287   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5288   if (f) {
5289     PetscCall((*f)(A, scall, glob, A_loc));
5290   } else {
5291     Mat_SeqAIJ        *a = (Mat_SeqAIJ *)Ad->data;
5292     Mat_SeqAIJ        *b = (Mat_SeqAIJ *)Ao->data;
5293     Mat_SeqAIJ        *c;
5294     PetscInt          *ai = a->i, *aj = a->j;
5295     PetscInt          *bi = b->i, *bj = b->j;
5296     PetscInt          *ci, *cj;
5297     const PetscScalar *aa, *ba;
5298     PetscScalar       *ca;
5299     PetscInt           i, j, am, dn, on;
5300 
5301     PetscCall(MatGetLocalSize(Ad, &am, &dn));
5302     PetscCall(MatGetLocalSize(Ao, NULL, &on));
5303     PetscCall(MatSeqAIJGetArrayRead(Ad, &aa));
5304     PetscCall(MatSeqAIJGetArrayRead(Ao, &ba));
5305     if (scall == MAT_INITIAL_MATRIX) {
5306       PetscInt k;
5307       PetscCall(PetscMalloc1(1 + am, &ci));
5308       PetscCall(PetscMalloc1(ai[am] + bi[am], &cj));
5309       PetscCall(PetscMalloc1(ai[am] + bi[am], &ca));
5310       ci[0] = 0;
5311       for (i = 0, k = 0; i < am; i++) {
5312         const PetscInt ncols_o = bi[i + 1] - bi[i];
5313         const PetscInt ncols_d = ai[i + 1] - ai[i];
5314         ci[i + 1]              = ci[i] + ncols_o + ncols_d;
5315         /* diagonal portion of A */
5316         for (j = 0; j < ncols_d; j++, k++) {
5317           cj[k] = *aj++;
5318           ca[k] = *aa++;
5319         }
5320         /* off-diagonal portion of A */
5321         for (j = 0; j < ncols_o; j++, k++) {
5322           cj[k] = dn + *bj++;
5323           ca[k] = *ba++;
5324         }
5325       }
5326       /* put together the new matrix */
5327       PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, dn + on, ci, cj, ca, A_loc));
5328       /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5329       /* Since these are PETSc arrays, change flags to free them as necessary. */
5330       c          = (Mat_SeqAIJ *)(*A_loc)->data;
5331       c->free_a  = PETSC_TRUE;
5332       c->free_ij = PETSC_TRUE;
5333       c->nonew   = 0;
5334       PetscCall(MatSetType(*A_loc, ((PetscObject)Ad)->type_name));
5335     } else if (scall == MAT_REUSE_MATRIX) {
5336       PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &ca));
5337       for (i = 0; i < am; i++) {
5338         const PetscInt ncols_d = ai[i + 1] - ai[i];
5339         const PetscInt ncols_o = bi[i + 1] - bi[i];
5340         /* diagonal portion of A */
5341         for (j = 0; j < ncols_d; j++) *ca++ = *aa++;
5342         /* off-diagonal portion of A */
5343         for (j = 0; j < ncols_o; j++) *ca++ = *ba++;
5344       }
5345       PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &ca));
5346     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5347     PetscCall(MatSeqAIJRestoreArrayRead(Ad, &aa));
5348     PetscCall(MatSeqAIJRestoreArrayRead(Ao, &ba));
5349     if (glob) {
5350       PetscInt cst, *gidx;
5351 
5352       PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
5353       PetscCall(PetscMalloc1(dn + on, &gidx));
5354       for (i = 0; i < dn; i++) gidx[i] = cst + i;
5355       for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
5356       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
5357     }
5358   }
5359   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5360   PetscFunctionReturn(PETSC_SUCCESS);
5361 }
5362 
5363 /*@C
5364   MatMPIAIJGetLocalMatCondensed - Creates a `MATSEQAIJ` matrix from a `MATMPIAIJ` matrix by taking all its local rows and NON-ZERO columns
5365 
5366   Not Collective
5367 
5368   Input Parameters:
5369 + A     - the matrix
5370 . scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5371 . row   - index set of rows to extract (or `NULL`)
5372 - col   - index set of columns to extract (or `NULL`)
5373 
5374   Output Parameter:
5375 . A_loc - the local sequential matrix generated
5376 
5377   Level: developer
5378 
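  Example Usage:
  A sketch (passing `NULL` for `row` and `col` lets the routine use all local rows and the nonzero columns of `A`):
.vb
  Mat A_loc;

  PetscCall(MatMPIAIJGetLocalMatCondensed(A, MAT_INITIAL_MATRIX, NULL, NULL, &A_loc));
  PetscCall(MatDestroy(&A_loc));
.ve
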
5379 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`
5380 @*/
5381 PetscErrorCode MatMPIAIJGetLocalMatCondensed(Mat A, MatReuse scall, IS *row, IS *col, Mat *A_loc)
5382 {
5383   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5384   PetscInt    i, start, end, ncols, nzA, nzB, *cmap, imark, *idx;
5385   IS          isrowa, iscola;
5386   Mat        *aloc;
5387   PetscBool   match;
5388 
5389   PetscFunctionBegin;
5390   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &match));
5391   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5392   PetscCall(PetscLogEventBegin(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5393   if (!row) {
5394     start = A->rmap->rstart;
5395     end   = A->rmap->rend;
5396     PetscCall(ISCreateStride(PETSC_COMM_SELF, end - start, start, 1, &isrowa));
5397   } else {
5398     isrowa = *row;
5399   }
5400   if (!col) {
5401     start = A->cmap->rstart;
5402     cmap  = a->garray;
5403     nzA   = a->A->cmap->n;
5404     nzB   = a->B->cmap->n;
5405     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5406     ncols = 0;
5407     for (i = 0; i < nzB; i++) {
5408       if (cmap[i] < start) idx[ncols++] = cmap[i];
5409       else break;
5410     }
5411     imark = i;
5412     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;
5413     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i];
5414     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &iscola));
5415   } else {
5416     iscola = *col;
5417   }
5418   if (scall != MAT_INITIAL_MATRIX) {
5419     PetscCall(PetscMalloc1(1, &aloc));
5420     aloc[0] = *A_loc;
5421   }
5422   PetscCall(MatCreateSubMatrices(A, 1, &isrowa, &iscola, scall, &aloc));
5423   if (!col) { /* attach global id of condensed columns */
5424     PetscCall(PetscObjectCompose((PetscObject)aloc[0], "_petsc_GetLocalMatCondensed_iscol", (PetscObject)iscola));
5425   }
5426   *A_loc = aloc[0];
5427   PetscCall(PetscFree(aloc));
5428   if (!row) PetscCall(ISDestroy(&isrowa));
5429   if (!col) PetscCall(ISDestroy(&iscola));
5430   PetscCall(PetscLogEventEnd(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5431   PetscFunctionReturn(PETSC_SUCCESS);
5432 }
5433 
5434 /*
5435  * Create a sequential AIJ matrix based on row indices; once a row is matched, all of its columns are extracted.
5436  * Rows can be local or remote. The routine is designed to be memory scalable, so nothing depends
5437  * on a global size.
5438  * */
5439 static PetscErrorCode MatCreateSeqSubMatrixWithRows_Private(Mat P, IS rows, Mat *P_oth)
5440 {
5441   Mat_MPIAIJ            *p  = (Mat_MPIAIJ *)P->data;
5442   Mat_SeqAIJ            *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data, *p_oth;
5443   PetscInt               plocalsize, nrows, *ilocal, *oilocal, i, lidx, *nrcols, *nlcols, ncol;
5444   PetscMPIInt            owner;
5445   PetscSFNode           *iremote, *oiremote;
5446   const PetscInt        *lrowindices;
5447   PetscSF                sf, osf;
5448   PetscInt               pcstart, *roffsets, *loffsets, *pnnz, j;
5449   PetscInt               ontotalcols, dntotalcols, ntotalcols, nout;
5450   MPI_Comm               comm;
5451   ISLocalToGlobalMapping mapping;
5452   const PetscScalar     *pd_a, *po_a;
5453 
5454   PetscFunctionBegin;
5455   PetscCall(PetscObjectGetComm((PetscObject)P, &comm));
5456   /* plocalsize is the number of roots
5457    * nrows is the number of leaves
5458    * */
5459   PetscCall(MatGetLocalSize(P, &plocalsize, NULL));
5460   PetscCall(ISGetLocalSize(rows, &nrows));
5461   PetscCall(PetscCalloc1(nrows, &iremote));
5462   PetscCall(ISGetIndices(rows, &lrowindices));
5463   for (i = 0; i < nrows; i++) {
5464     /* Find a remote index and an owner for a row
5465      * The row could be local or remote
5466      * */
5467     owner = 0;
5468     lidx  = 0;
5469     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, &lidx));
5470     iremote[i].index = lidx;
5471     iremote[i].rank  = owner;
5472   }
5473   /* Create SF to communicate how many nonzero columns for each row */
5474   PetscCall(PetscSFCreate(comm, &sf));
5475   /* SF will figure out the number of nonzero columns for each row, and their
5476    * offsets
5477    * */
5478   PetscCall(PetscSFSetGraph(sf, plocalsize, nrows, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5479   PetscCall(PetscSFSetFromOptions(sf));
5480   PetscCall(PetscSFSetUp(sf));
5481 
5482   PetscCall(PetscCalloc1(2 * (plocalsize + 1), &roffsets));
5483   PetscCall(PetscCalloc1(2 * plocalsize, &nrcols));
5484   PetscCall(PetscCalloc1(nrows, &pnnz));
5485   roffsets[0] = 0;
5486   roffsets[1] = 0;
5487   for (i = 0; i < plocalsize; i++) {
5488     /* diagonal */
5489     nrcols[i * 2 + 0] = pd->i[i + 1] - pd->i[i];
5490     /* off-diagonal */
5491     nrcols[i * 2 + 1] = po->i[i + 1] - po->i[i];
5492     /* compute offsets so that we know the relative location of each row */
5493     roffsets[(i + 1) * 2 + 0] = roffsets[i * 2 + 0] + nrcols[i * 2 + 0];
5494     roffsets[(i + 1) * 2 + 1] = roffsets[i * 2 + 1] + nrcols[i * 2 + 1];
5495   }
5496   PetscCall(PetscCalloc1(2 * nrows, &nlcols));
5497   PetscCall(PetscCalloc1(2 * nrows, &loffsets));
5498   /* 'r' means root, and 'l' means leaf */
5499   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5500   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5501   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5502   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5503   PetscCall(PetscSFDestroy(&sf));
5504   PetscCall(PetscFree(roffsets));
5505   PetscCall(PetscFree(nrcols));
5506   dntotalcols = 0;
5507   ontotalcols = 0;
5508   ncol        = 0;
5509   for (i = 0; i < nrows; i++) {
5510     pnnz[i] = nlcols[i * 2 + 0] + nlcols[i * 2 + 1];
5511     ncol    = PetscMax(pnnz[i], ncol);
5512     /* diagonal */
5513     dntotalcols += nlcols[i * 2 + 0];
5514     /* off-diagonal */
5515     ontotalcols += nlcols[i * 2 + 1];
5516   }
5517   /* We do not need to figure out the right number of columns
5518    * since all the calculations will be done by going through the raw data
5519    * */
5520   PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, nrows, ncol, 0, pnnz, P_oth));
5521   PetscCall(MatSetUp(*P_oth));
5522   PetscCall(PetscFree(pnnz));
5523   p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5524   /* diagonal */
5525   PetscCall(PetscCalloc1(dntotalcols, &iremote));
5526   /* off-diagonal */
5527   PetscCall(PetscCalloc1(ontotalcols, &oiremote));
5528   /* diagonal */
5529   PetscCall(PetscCalloc1(dntotalcols, &ilocal));
5530   /* off-diagonal */
5531   PetscCall(PetscCalloc1(ontotalcols, &oilocal));
5532   dntotalcols = 0;
5533   ontotalcols = 0;
5534   ntotalcols  = 0;
5535   for (i = 0; i < nrows; i++) {
5536     owner = 0;
5537     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, NULL));
5538     /* Set iremote for diag matrix */
5539     for (j = 0; j < nlcols[i * 2 + 0]; j++) {
5540       iremote[dntotalcols].index = loffsets[i * 2 + 0] + j;
5541       iremote[dntotalcols].rank  = owner;
5542       /* P_oth is SeqAIJ, so ilocal needs to point to the first part of memory */
5543       ilocal[dntotalcols++] = ntotalcols++;
5544     }
5545     /* off-diagonal */
5546     for (j = 0; j < nlcols[i * 2 + 1]; j++) {
5547       oiremote[ontotalcols].index = loffsets[i * 2 + 1] + j;
5548       oiremote[ontotalcols].rank  = owner;
5549       oilocal[ontotalcols++]      = ntotalcols++;
5550     }
5551   }
5552   PetscCall(ISRestoreIndices(rows, &lrowindices));
5553   PetscCall(PetscFree(loffsets));
5554   PetscCall(PetscFree(nlcols));
5555   PetscCall(PetscSFCreate(comm, &sf));
5556   /* P serves as roots and P_oth is leaves
5557    * Diag matrix
5558    * */
5559   PetscCall(PetscSFSetGraph(sf, pd->i[plocalsize], dntotalcols, ilocal, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5560   PetscCall(PetscSFSetFromOptions(sf));
5561   PetscCall(PetscSFSetUp(sf));
5562 
5563   PetscCall(PetscSFCreate(comm, &osf));
5564   /* off-diagonal */
5565   PetscCall(PetscSFSetGraph(osf, po->i[plocalsize], ontotalcols, oilocal, PETSC_OWN_POINTER, oiremote, PETSC_OWN_POINTER));
5566   PetscCall(PetscSFSetFromOptions(osf));
5567   PetscCall(PetscSFSetUp(osf));
5568   PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5569   PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5570   /* operate on the matrix internal data to save memory */
5571   PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5572   PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5573   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, NULL));
5574   /* Convert to global indices for diag matrix */
5575   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] += pcstart;
5576   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
5577   /* We want P_oth to store global indices */
5578   PetscCall(ISLocalToGlobalMappingCreate(comm, 1, p->B->cmap->n, p->garray, PETSC_COPY_VALUES, &mapping));
5579   /* Use memory scalable approach */
5580   PetscCall(ISLocalToGlobalMappingSetType(mapping, ISLOCALTOGLOBALMAPPINGHASH));
5581   PetscCall(ISLocalToGlobalMappingApply(mapping, po->i[plocalsize], po->j, po->j));
5582   PetscCall(PetscSFBcastBegin(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5583   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
5584   /* Convert back to local indices */
5585   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] -= pcstart;
5586   PetscCall(PetscSFBcastEnd(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5587   nout = 0;
5588   PetscCall(ISGlobalToLocalMappingApply(mapping, IS_GTOLM_DROP, po->i[plocalsize], po->j, &nout, po->j));
5589   PetscCheck(nout == po->i[plocalsize], comm, PETSC_ERR_ARG_INCOMP, "n %" PetscInt_FMT " does not equal nout %" PetscInt_FMT, po->i[plocalsize], nout);
5590   PetscCall(ISLocalToGlobalMappingDestroy(&mapping));
5591   /* Exchange values */
5592   PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5593   PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5594   PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5595   PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5596   /* Stop PETSc from shrinking memory */
5597   for (i = 0; i < nrows; i++) p_oth->ilen[i] = p_oth->imax[i];
5598   PetscCall(MatAssemblyBegin(*P_oth, MAT_FINAL_ASSEMBLY));
5599   PetscCall(MatAssemblyEnd(*P_oth, MAT_FINAL_ASSEMBLY));
5600   /* Attach PetscSF objects to P_oth so that we can reuse it later */
5601   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "diagsf", (PetscObject)sf));
5602   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "offdiagsf", (PetscObject)osf));
5603   PetscCall(PetscSFDestroy(&sf));
5604   PetscCall(PetscSFDestroy(&osf));
5605   PetscFunctionReturn(PETSC_SUCCESS);
5606 }
5607 
5608 /*
5609  * Creates a SeqAIJ matrix by taking the rows of B that correspond to the nonzero columns of the local A
5610  * This supports MPIAIJ and MAIJ
5611  * */
5612 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat A, Mat P, PetscInt dof, MatReuse reuse, Mat *P_oth)
5613 {
5614   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
5615   Mat_SeqAIJ *p_oth;
5616   IS          rows, map;
5617   PetscHMapI  hamp;
5618   PetscInt    i, htsize, *rowindices, off, *mapping, key, count;
5619   MPI_Comm    comm;
5620   PetscSF     sf, osf;
5621   PetscBool   has;
5622 
5623   PetscFunctionBegin;
5624   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5625   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, P, 0, 0));
5626   /* If it is the first time, create an index set of off-diag nonzero columns of A,
5627    *  and then create a submatrix (that often is an overlapping matrix)
5628    * */
5629   if (reuse == MAT_INITIAL_MATRIX) {
5630     /* Use a hash table to figure out unique keys */
5631     PetscCall(PetscHMapICreateWithSize(a->B->cmap->n, &hamp));
5632     PetscCall(PetscCalloc1(a->B->cmap->n, &mapping));
5633     count = 0;
5634     /* Assume that a->garray is sorted, otherwise the following does not make sense */
5635     for (i = 0; i < a->B->cmap->n; i++) {
5636       key = a->garray[i] / dof;
5637       PetscCall(PetscHMapIHas(hamp, key, &has));
5638       if (!has) {
5639         mapping[i] = count;
5640         PetscCall(PetscHMapISet(hamp, key, count++));
5641       } else {
5642         /* The current key has the same value as in the previous step */
5643         mapping[i] = count - 1;
5644       }
5645     }
5646     PetscCall(ISCreateGeneral(comm, a->B->cmap->n, mapping, PETSC_OWN_POINTER, &map));
5647     PetscCall(PetscHMapIGetSize(hamp, &htsize));
5648     PetscCheck(htsize == count, comm, PETSC_ERR_ARG_INCOMP, " Size of hash map %" PetscInt_FMT " is inconsistent with count %" PetscInt_FMT, htsize, count);
5649     PetscCall(PetscCalloc1(htsize, &rowindices));
5650     off = 0;
5651     PetscCall(PetscHMapIGetKeys(hamp, &off, rowindices));
5652     PetscCall(PetscHMapIDestroy(&hamp));
5653     PetscCall(PetscSortInt(htsize, rowindices));
5654     PetscCall(ISCreateGeneral(comm, htsize, rowindices, PETSC_OWN_POINTER, &rows));
5655     /* In case the matrix was already created but the user wants to recreate it */
5656     PetscCall(MatDestroy(P_oth));
5657     PetscCall(MatCreateSeqSubMatrixWithRows_Private(P, rows, P_oth));
5658     PetscCall(PetscObjectCompose((PetscObject)*P_oth, "aoffdiagtopothmapping", (PetscObject)map));
5659     PetscCall(ISDestroy(&map));
5660     PetscCall(ISDestroy(&rows));
5661   } else if (reuse == MAT_REUSE_MATRIX) {
5662     /* If matrix was already created, we simply update values using SF objects
5663      * that were attached to the matrix earlier.
5664      */
5665     const PetscScalar *pd_a, *po_a;
5666 
5667     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "diagsf", (PetscObject *)&sf));
5668     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "offdiagsf", (PetscObject *)&osf));
5669     PetscCheck(sf && osf, comm, PETSC_ERR_ARG_NULL, "Matrix is not initialized yet");
5670     p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5671     /* Update values in place */
5672     PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5673     PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5674     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5675     PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5676     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5677     PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5678     PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5679     PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5680   } else SETERRQ(comm, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unknown reuse type");
5681   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, P, 0, 0));
5682   PetscFunctionReturn(PETSC_SUCCESS);
5683 }
5684 
5685 /*@C
5686   MatGetBrowsOfAcols - Returns an `IS` containing the rows of `B` that correspond to the nonzero columns of the local `A`
5687 
5688   Collective
5689 
5690   Input Parameters:
5691 + A     - the first matrix in `MATMPIAIJ` format
5692 . B     - the second matrix in `MATMPIAIJ` format
5693 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5694 
5695   Output Parameters:
5696 + rowb  - on input, the index set of rows of `B` to extract (or `NULL`); modified on output
5697 . colb  - on input, the index set of columns of `B` to extract (or `NULL`); modified on output
5698 - B_seq - the sequential matrix generated
5699 
5700   Level: developer
5701 
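  Example Usage:
  A sketch of the initial/reuse calling sequence (the index sets and the sequential matrix created on the first call are passed back in on reuse):
.vb
  IS  rowb = NULL, colb = NULL;
  Mat B_seq = NULL;

  PetscCall(MatGetBrowsOfAcols(A, B, MAT_INITIAL_MATRIX, &rowb, &colb, &B_seq));
  /* ... the values of B change, its nonzero pattern does not ... */
  PetscCall(MatGetBrowsOfAcols(A, B, MAT_REUSE_MATRIX, &rowb, &colb, &B_seq));
  PetscCall(ISDestroy(&rowb));
  PetscCall(ISDestroy(&colb));
  PetscCall(MatDestroy(&B_seq));
.ve
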
5702 .seealso: `Mat`, `MATMPIAIJ`, `IS`, `MatReuse`
5703 @*/
5704 PetscErrorCode MatGetBrowsOfAcols(Mat A, Mat B, MatReuse scall, IS *rowb, IS *colb, Mat *B_seq)
5705 {
5706   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5707   PetscInt   *idx, i, start, ncols, nzA, nzB, *cmap, imark;
5708   IS          isrowb, iscolb;
5709   Mat        *bseq = NULL;
5710 
5711   PetscFunctionBegin;
5712   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5713              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5714   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAcols, A, B, 0, 0));
5715 
5716   if (scall == MAT_INITIAL_MATRIX) {
5717     start = A->cmap->rstart;
5718     cmap  = a->garray;
5719     nzA   = a->A->cmap->n;
5720     nzB   = a->B->cmap->n;
5721     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5722     ncols = 0;
5723     for (i = 0; i < nzB; i++) { /* row < local row index */
5724       if (cmap[i] < start) idx[ncols++] = cmap[i];
5725       else break;
5726     }
5727     imark = i;
5728     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;   /* local rows */
5729     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i]; /* row > local row index */
5730     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &isrowb));
5731     PetscCall(ISCreateStride(PETSC_COMM_SELF, B->cmap->N, 0, 1, &iscolb));
5732   } else {
5733     PetscCheck(rowb && colb, PETSC_COMM_SELF, PETSC_ERR_SUP, "IS rowb and colb must be provided for MAT_REUSE_MATRIX");
5734     isrowb = *rowb;
5735     iscolb = *colb;
5736     PetscCall(PetscMalloc1(1, &bseq));
5737     bseq[0] = *B_seq;
5738   }
5739   PetscCall(MatCreateSubMatrices(B, 1, &isrowb, &iscolb, scall, &bseq));
5740   *B_seq = bseq[0];
5741   PetscCall(PetscFree(bseq));
5742   if (!rowb) {
5743     PetscCall(ISDestroy(&isrowb));
5744   } else {
5745     *rowb = isrowb;
5746   }
5747   if (!colb) {
5748     PetscCall(ISDestroy(&iscolb));
5749   } else {
5750     *colb = iscolb;
5751   }
5752   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAcols, A, B, 0, 0));
5753   PetscFunctionReturn(PETSC_SUCCESS);
5754 }
5755 
5756 /*
5757     MatGetBrowsOfAoCols_MPIAIJ - Creates a `MATSEQAIJ` matrix by taking the rows of B that correspond to the nonzero columns
5758     of the OFF-DIAGONAL portion of local A
5759 
5760     Collective
5761 
5762    Input Parameters:
5763 +    A,B - the matrices in `MATMPIAIJ` format
5764 -    scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5765 
5766    Output Parameters:
5767 +    startsj_s - starting point in B's sending j-arrays, saved for MAT_REUSE (or NULL)
5768 .    startsj_r - starting point in B's receiving j-arrays, saved for MAT_REUSE (or NULL)
5769 .    bufa_ptr - array for sending matrix values, saved for MAT_REUSE (or NULL)
5770 -    B_oth - the sequential matrix generated with size aBn=a->B->cmap->n by B->cmap->N
5771 
5772     Developer Note:
5773     This directly accesses information inside the VecScatter associated with the matrix-vector product
5774      for this matrix. This is not desirable.
5775 
5776     Level: developer
5777 
5778 */
5779 
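/*
   A hedged sketch of the intended calling sequence (internal use only; the variable names below are illustrative, not from the callers):

     PetscInt  *startsj_s = NULL, *startsj_r = NULL;
     MatScalar *bufa      = NULL;
     Mat        B_oth     = NULL;

     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, B, MAT_INITIAL_MATRIX, &startsj_s, &startsj_r, &bufa, &B_oth));
     ... the values of B change while its nonzero pattern stays the same ...
     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, B, MAT_REUSE_MATRIX, &startsj_s, &startsj_r, &bufa, &B_oth));
*/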
5780 PetscErrorCode MatGetBrowsOfAoCols_MPIAIJ(Mat A, Mat B, MatReuse scall, PetscInt **startsj_s, PetscInt **startsj_r, MatScalar **bufa_ptr, Mat *B_oth)
5781 {
5782   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
5783   VecScatter         ctx;
5784   MPI_Comm           comm;
5785   const PetscMPIInt *rprocs, *sprocs;
5786   PetscMPIInt        nrecvs, nsends;
5787   const PetscInt    *srow, *rstarts, *sstarts;
5788   PetscInt          *rowlen, *bufj, *bufJ, ncols = 0, aBn = a->B->cmap->n, row, *b_othi, *b_othj, *rvalues = NULL, *svalues = NULL, *cols, sbs, rbs;
5789   PetscInt           i, j, k = 0, l, ll, nrows, *rstartsj = NULL, *sstartsj, len;
5790   PetscScalar       *b_otha, *bufa, *bufA, *vals = NULL;
5791   MPI_Request       *reqs = NULL, *rwaits = NULL, *swaits = NULL;
5792   PetscMPIInt        size, tag, rank, nreqs;
5793 
5794   PetscFunctionBegin;
5795   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5796   PetscCallMPI(MPI_Comm_size(comm, &size));
5797 
5798   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5799              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5800   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, B, 0, 0));
5801   PetscCallMPI(MPI_Comm_rank(comm, &rank));
5802 
5803   if (size == 1) {
5804     startsj_s = NULL;
5805     bufa_ptr  = NULL;
5806     *B_oth    = NULL;
5807     PetscFunctionReturn(PETSC_SUCCESS);
5808   }
5809 
5810   ctx = a->Mvctx;
5811   tag = ((PetscObject)ctx)->tag;
5812 
5813   PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &srow, &sprocs, &sbs));
5814   /* rprocs[] must be ordered so that indices received from them are ordered in rvalues[], which is key to algorithms used in this subroutine */
5815   PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL /*indices not needed*/, &rprocs, &rbs));
5816   PetscCall(PetscMPIIntCast(nsends + nrecvs, &nreqs));
5817   PetscCall(PetscMalloc1(nreqs, &reqs));
5818   rwaits = reqs;
5819   swaits = PetscSafePointerPlusOffset(reqs, nrecvs);
5820 
5821   if (!startsj_s || !bufa_ptr) scall = MAT_INITIAL_MATRIX;
5822   if (scall == MAT_INITIAL_MATRIX) {
5823     /* i-array */
5824     /*  post receives */
5825     if (nrecvs) PetscCall(PetscMalloc1(rbs * (rstarts[nrecvs] - rstarts[0]), &rvalues)); /* rstarts can be NULL when nrecvs=0 */
5826     for (i = 0; i < nrecvs; i++) {
5827       rowlen = rvalues + rstarts[i] * rbs;
5828       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of indices to be received */
5829       PetscCallMPI(MPIU_Irecv(rowlen, nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5830     }
5831 
5832     /* pack the outgoing message */
5833     PetscCall(PetscMalloc2(nsends + 1, &sstartsj, nrecvs + 1, &rstartsj));
5834 
5835     sstartsj[0] = 0;
5836     rstartsj[0] = 0;
5837     len         = 0; /* total length of j or a array to be sent */
5838     if (nsends) {
5839       k = sstarts[0]; /* ATTENTION: sstarts[0] and rstarts[0] are not necessarily zero */
5840       PetscCall(PetscMalloc1(sbs * (sstarts[nsends] - sstarts[0]), &svalues));
5841     }
5842     for (i = 0; i < nsends; i++) {
5843       rowlen = svalues + (sstarts[i] - sstarts[0]) * sbs;
5844       nrows  = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5845       for (j = 0; j < nrows; j++) {
5846         row = srow[k] + B->rmap->range[rank]; /* global row idx */
5847         for (l = 0; l < sbs; l++) {
5848           PetscCall(MatGetRow_MPIAIJ(B, row + l, &ncols, NULL, NULL)); /* rowlength */
5849 
5850           rowlen[j * sbs + l] = ncols;
5851 
5852           len += ncols;
5853           PetscCall(MatRestoreRow_MPIAIJ(B, row + l, &ncols, NULL, NULL));
5854         }
5855         k++;
5856       }
5857       PetscCallMPI(MPIU_Isend(rowlen, nrows * sbs, MPIU_INT, sprocs[i], tag, comm, swaits + i));
5858 
5859       sstartsj[i + 1] = len; /* starting point of (i+1)-th outgoing msg in bufj and bufa */
5860     }
5861     /* recvs and sends of i-array are completed */
5862     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5863     PetscCall(PetscFree(svalues));
5864 
5865     /* allocate buffers for sending j and a arrays */
5866     PetscCall(PetscMalloc1(len + 1, &bufj));
5867     PetscCall(PetscMalloc1(len + 1, &bufa));
5868 
5869     /* create i-array of B_oth */
5870     PetscCall(PetscMalloc1(aBn + 2, &b_othi));
5871 
5872     b_othi[0] = 0;
5873     len       = 0; /* total length of j or a array to be received */
5874     k         = 0;
5875     for (i = 0; i < nrecvs; i++) {
5876       rowlen = rvalues + (rstarts[i] - rstarts[0]) * rbs;
5877       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of rows to be received */
5878       for (j = 0; j < nrows; j++) {
5879         b_othi[k + 1] = b_othi[k] + rowlen[j];
5880         PetscCall(PetscIntSumError(rowlen[j], len, &len));
5881         k++;
5882       }
5883       rstartsj[i + 1] = len; /* starting point of (i+1)-th incoming msg in bufj and bufa */
5884     }
5885     PetscCall(PetscFree(rvalues));
5886 
5887     /* allocate space for j and a arrays of B_oth */
5888     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_othj));
5889     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_otha));
5890 
5891     /* j-array */
5892     /*  post receives of j-array */
5893     for (i = 0; i < nrecvs; i++) {
5894       nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5895       PetscCallMPI(MPIU_Irecv(b_othj + rstartsj[i], nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5896     }
5897 
5898     /* pack the outgoing message j-array */
5899     if (nsends) k = sstarts[0];
5900     for (i = 0; i < nsends; i++) {
5901       nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5902       bufJ  = bufj + sstartsj[i];
5903       for (j = 0; j < nrows; j++) {
5904         row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5905         for (ll = 0; ll < sbs; ll++) {
5906           PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5907           for (l = 0; l < ncols; l++) *bufJ++ = cols[l];
5908           PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5909         }
5910       }
5911       PetscCallMPI(MPIU_Isend(bufj + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_INT, sprocs[i], tag, comm, swaits + i));
5912     }
5913 
5914     /* recvs and sends of j-array are completed */
5915     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5916   } else if (scall == MAT_REUSE_MATRIX) {
5917     sstartsj = *startsj_s;
5918     rstartsj = *startsj_r;
5919     bufa     = *bufa_ptr;
5920     PetscCall(MatSeqAIJGetArrayWrite(*B_oth, &b_otha));
5921   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5922 
5923   /* a-array */
5924   /*  post receives of a-array */
5925   for (i = 0; i < nrecvs; i++) {
5926     nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5927     PetscCallMPI(MPIU_Irecv(b_otha + rstartsj[i], nrows, MPIU_SCALAR, rprocs[i], tag, comm, rwaits + i));
5928   }
5929 
5930   /* pack the outgoing message a-array */
5931   if (nsends) k = sstarts[0];
5932   for (i = 0; i < nsends; i++) {
5933     nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5934     bufA  = bufa + sstartsj[i];
5935     for (j = 0; j < nrows; j++) {
5936       row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5937       for (ll = 0; ll < sbs; ll++) {
5938         PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5939         for (l = 0; l < ncols; l++) *bufA++ = vals[l];
5940         PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5941       }
5942     }
5943     PetscCallMPI(MPIU_Isend(bufa + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_SCALAR, sprocs[i], tag, comm, swaits + i));
5944   }
5945   /* recvs and sends of a-array are completed */
5946   if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5947   PetscCall(PetscFree(reqs));
5948 
5949   if (scall == MAT_INITIAL_MATRIX) {
5950     Mat_SeqAIJ *b_oth;
5951 
5952     /* put together the new matrix */
5953     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, aBn, B->cmap->N, b_othi, b_othj, b_otha, B_oth));
5954 
5955     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5956     /* Since these are PETSc arrays, change flags to free them as necessary. */
5957     b_oth          = (Mat_SeqAIJ *)(*B_oth)->data;
5958     b_oth->free_a  = PETSC_TRUE;
5959     b_oth->free_ij = PETSC_TRUE;
5960     b_oth->nonew   = 0;
5961 
5962     PetscCall(PetscFree(bufj));
5963     if (!startsj_s || !bufa_ptr) {
5964       PetscCall(PetscFree2(sstartsj, rstartsj));
5965       PetscCall(PetscFree(bufa_ptr));
5966     } else {
5967       *startsj_s = sstartsj;
5968       *startsj_r = rstartsj;
5969       *bufa_ptr  = bufa;
5970     }
5971   } else if (scall == MAT_REUSE_MATRIX) {
5972     PetscCall(MatSeqAIJRestoreArrayWrite(*B_oth, &b_otha));
5973   }
5974 
5975   PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE, &nsends, &sstarts, &srow, &sprocs, &sbs));
5976   PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE, &nrecvs, &rstarts, NULL, &rprocs, &rbs));
5977   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, B, 0, 0));
5978   PetscFunctionReturn(PETSC_SUCCESS);
5979 }
5980 
5981 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCRL(Mat, MatType, MatReuse, Mat *);
5982 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJPERM(Mat, MatType, MatReuse, Mat *);
5983 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJSELL(Mat, MatType, MatReuse, Mat *);
5984 #if defined(PETSC_HAVE_MKL_SPARSE)
5985 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJMKL(Mat, MatType, MatReuse, Mat *);
5986 #endif
5987 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIBAIJ(Mat, MatType, MatReuse, Mat *);
5988 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISBAIJ(Mat, MatType, MatReuse, Mat *);
5989 #if defined(PETSC_HAVE_ELEMENTAL)
5990 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_Elemental(Mat, MatType, MatReuse, Mat *);
5991 #endif
5992 #if defined(PETSC_HAVE_SCALAPACK)
5993 PETSC_INTERN PetscErrorCode MatConvert_AIJ_ScaLAPACK(Mat, MatType, MatReuse, Mat *);
5994 #endif
5995 #if defined(PETSC_HAVE_HYPRE)
5996 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
5997 #endif
5998 #if defined(PETSC_HAVE_CUDA)
5999 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
6000 #endif
6001 #if defined(PETSC_HAVE_HIP)
6002 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
6003 #endif
6004 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6005 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat, MatType, MatReuse, Mat *);
6006 #endif
6007 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISELL(Mat, MatType, MatReuse, Mat *);
6008 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
6009 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_IS_XAIJ(Mat);
6010 
6011 /*
6012     Computes (B'*A')' since computing B*A directly is untenable
6013 
6014                n                       p                          p
6015         [             ]       [             ]         [                 ]
6016       m [      A      ]  *  n [       B     ]   =   m [         C       ]
6017         [             ]       [             ]         [                 ]
6018 
6019 */
6020 static PetscErrorCode MatMatMultNumeric_MPIDense_MPIAIJ(Mat A, Mat B, Mat C)
6021 {
6022   Mat At, Bt, Ct;
6023 
6024   PetscFunctionBegin;
6025   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
6026   PetscCall(MatTranspose(B, MAT_INITIAL_MATRIX, &Bt));
6027   PetscCall(MatMatMult(Bt, At, MAT_INITIAL_MATRIX, PETSC_CURRENT, &Ct));
6028   PetscCall(MatDestroy(&At));
6029   PetscCall(MatDestroy(&Bt));
6030   PetscCall(MatTransposeSetPrecursor(Ct, C));
6031   PetscCall(MatTranspose(Ct, MAT_REUSE_MATRIX, &C));
6032   PetscCall(MatDestroy(&Ct));
6033   PetscFunctionReturn(PETSC_SUCCESS);
6034 }
6035 
6036 static PetscErrorCode MatMatMultSymbolic_MPIDense_MPIAIJ(Mat A, Mat B, PetscReal fill, Mat C)
6037 {
6038   PetscBool cisdense;
6039 
6040   PetscFunctionBegin;
6041   PetscCheck(A->cmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "A->cmap->n %" PetscInt_FMT " != B->rmap->n %" PetscInt_FMT, A->cmap->n, B->rmap->n);
6042   PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
6043   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6044   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSECUDA, MATMPIDENSEHIP, ""));
6045   if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
6046   PetscCall(MatSetUp(C));
6047 
6048   C->ops->matmultnumeric = MatMatMultNumeric_MPIDense_MPIAIJ;
6049   PetscFunctionReturn(PETSC_SUCCESS);
6050 }
6051 
6052 static PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ_AB(Mat C)
6053 {
6054   Mat_Product *product = C->product;
6055   Mat          A = product->A, B = product->B;
6056 
6057   PetscFunctionBegin;
6058   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
6059              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
6060   C->ops->matmultsymbolic = MatMatMultSymbolic_MPIDense_MPIAIJ;
6061   C->ops->productsymbolic = MatProductSymbolic_AB;
6062   PetscFunctionReturn(PETSC_SUCCESS);
6063 }
6064 
6065 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ(Mat C)
6066 {
6067   Mat_Product *product = C->product;
6068 
6069   PetscFunctionBegin;
6070   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_MPIDense_MPIAIJ_AB(C));
6071   PetscFunctionReturn(PETSC_SUCCESS);
6072 }
6073 
6074 /*
6075    Merge two sets of sorted nonzeros and return a CSR for the merged (sequential) matrix
6076 
6077   Input Parameters:
6078 
6079     j1,rowBegin1,rowEnd1,jmap1: describe the first set of nonzeros (Set1)
6080     j2,rowBegin2,rowEnd2,jmap2: describe the second set of nonzeros (Set2)
6081 
6082     mat: both sets' nonzeros are on m rows, where m is the number of local rows of the matrix mat
6083 
6084     For Set1, j1[] contains column indices of the nonzeros.
6085     For the k-th row (0<=k<m), [rowBegin1[k],rowEnd1[k]) index into j1[] and point to the begin/end nonzero in row k
6086     respectively (note rowEnd1[k] is not necessarily equal to rowBegin1[k+1]). Indices in this range of j1[] are sorted,
6087     but might have repeats. jmap1[t+1] - jmap1[t] is the number of repeats for the t-th unique nonzero in Set1.
6088 
6089     Similar for Set2.
6090 
6091     This routine merges the two sets of nonzeros row by row and removes repeats.
6092 
6093   Output Parameters: (memory is allocated by the caller)
6094 
6095     i[],j[]: the CSR of the merged matrix, which has m rows.
6096     imap1[]: the k-th unique nonzero in Set1 (k=0,1,...) corresponds to the imap1[k]-th unique nonzero in the merged matrix.
6097     imap2[]: similar to imap1[], but for Set2.
6098     Note we order nonzeros row-by-row and from left to right.
6099 */
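/*
   A small worked example of the merge described above, added for illustration (single row, m = 1):

     Set1: j1 = {1,1,4},   rowBegin1 = {0}, rowEnd1 = {3}, jmap1 = {0,2,3}  (unique columns 1 and 4)
     Set2: j2 = {2,4,4,4}, rowBegin2 = {0}, rowEnd2 = {4}, jmap2 = {0,1,4}  (unique columns 2 and 4)

   The merged row has the unique columns {1,2,4}, so the routine fills

     i = {0,3}, j = {1,2,4}, imap1 = {0,2}, imap2 = {1,2}

   i.e., the unique nonzeros of Set1 land at positions 0 and 2 of the merged row, and those of Set2 at positions 1 and 2.
*/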
6100 static PetscErrorCode MatMergeEntries_Internal(Mat mat, const PetscInt j1[], const PetscInt j2[], const PetscCount rowBegin1[], const PetscCount rowEnd1[], const PetscCount rowBegin2[], const PetscCount rowEnd2[], const PetscCount jmap1[], const PetscCount jmap2[], PetscCount imap1[], PetscCount imap2[], PetscInt i[], PetscInt j[])
6101 {
6102   PetscInt   r, m; /* Row index of mat */
6103   PetscCount t, t1, t2, b1, e1, b2, e2;
6104 
6105   PetscFunctionBegin;
6106   PetscCall(MatGetLocalSize(mat, &m, NULL));
6107   t1 = t2 = t = 0; /* Count unique nonzeros in Set1, Set2 and the merged matrix, respectively */
6108   i[0]        = 0;
6109   for (r = 0; r < m; r++) { /* Do row by row merging */
6110     b1 = rowBegin1[r];
6111     e1 = rowEnd1[r];
6112     b2 = rowBegin2[r];
6113     e2 = rowEnd2[r];
6114     while (b1 < e1 && b2 < e2) {
6115       if (j1[b1] == j2[b2]) { /* Same column index and hence same nonzero */
6116         j[t]      = j1[b1];
6117         imap1[t1] = t;
6118         imap2[t2] = t;
6119         b1 += jmap1[t1 + 1] - jmap1[t1]; /* Jump to the next unique nonzero in Set1 */
6120         b2 += jmap2[t2 + 1] - jmap2[t2]; /* Jump to the next unique nonzero in Set2 */
6121         t1++;
6122         t2++;
6123         t++;
6124       } else if (j1[b1] < j2[b2]) {
6125         j[t]      = j1[b1];
6126         imap1[t1] = t;
6127         b1 += jmap1[t1 + 1] - jmap1[t1];
6128         t1++;
6129         t++;
6130       } else {
6131         j[t]      = j2[b2];
6132         imap2[t2] = t;
6133         b2 += jmap2[t2 + 1] - jmap2[t2];
6134         t2++;
6135         t++;
6136       }
6137     }
6138     /* Merge the remaining in either j1[] or j2[] */
6139     while (b1 < e1) {
6140       j[t]      = j1[b1];
6141       imap1[t1] = t;
6142       b1 += jmap1[t1 + 1] - jmap1[t1];
6143       t1++;
6144       t++;
6145     }
6146     while (b2 < e2) {
6147       j[t]      = j2[b2];
6148       imap2[t2] = t;
6149       b2 += jmap2[t2 + 1] - jmap2[t2];
6150       t2++;
6151       t++;
6152     }
6153     PetscCall(PetscIntCast(t, i + r + 1));
6154   }
6155   PetscFunctionReturn(PETSC_SUCCESS);
6156 }
6157 
6158 /*
6159   Split nonzeros in a block of local rows into two subsets: those in the diagonal block and those in the off-diagonal block
6160 
6161   Input Parameters:
6162     mat: an MPI matrix that provides row and column layout information for splitting. Let's say its number of local rows is m.
6163     n,i[],j[],perm[]: there are n input entries, belonging to m rows. Row/col indices of the entries are stored in i[] and j[]
6164       respectively, along with a permutation array perm[]. Length of the i[],j[],perm[] arrays is n.
6165 
6166       i[] is already sorted, but within a row, j[] is not sorted and might have repeats.
6167       i[] might contain negative indices at the beginning, which means the corresponding entries should be ignored in the splitting.
6168 
6169   Output Parameters:
6170     j[],perm[]: the routine needs to sort j[] within each row along with perm[].
6171     rowBegin[],rowMid[],rowEnd[]: of length m, and the memory is preallocated and zeroed by the caller.
6172       They contain indices pointing to j[]. For 0<=r<m, [rowBegin[r],rowMid[r]) point to begin/end entries of row r of the diagonal block,
6173       and [rowMid[r],rowEnd[r]) point to begin/end entries of row r of the off-diagonal block.
6174 
6175     Aperm[],Ajmap[],Atot,Annz: Arrays are allocated by this routine.
6176       Atot: number of entries belonging to the diagonal block.
6177       Annz: number of unique nonzeros belonging to the diagonal block.
6178       Aperm[Atot] stores values from perm[] for entries belonging to the diagonal block. Length of Aperm[] is Atot, though it may also count
6179         repeats (i.e., same 'i,j' pair).
6180       Ajmap[Annz+1] stores the number of repeats of each unique entry belonging to the diagonal block. More precisely, Ajmap[t+1] - Ajmap[t]
6181         is the number of repeats for the t-th unique entry in the diagonal block. Ajmap[0] is always 0.
6185 
6186     Bperm[], Bjmap[], Btot, Bnnz are similar but for the off-diagonal block.
6187 
6188     Aperm[],Bperm[],Ajmap[] and Bjmap[] are allocated separately by this routine with PetscMalloc1().
6189 */
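/*
   A small worked example of the split described above, added for illustration. Assume the local rows are [0,2),
   the diagonal-block columns are [cstart,cend) = [0,2), the global number of columns is at least 4, and the
   n = 5 input entries are

     i = {0,0,0,1,1},  j = {3,1,1,0,2}

   Row 0 holds the diagonal entry (0,1) twice and the off-diagonal entry (0,3) once; row 1 holds the diagonal
   entry (1,0) and the off-diagonal entry (1,2). The routine then fills

     rowBegin = {0,3}, rowMid = {2,4}, rowEnd = {3,5}
     Atot = 3, Annz = 2, Ajmap = {0,2,3}   (the first unique diagonal nonzero has two repeats)
     Btot = 2, Bnnz = 2, Bjmap = {0,1,2}

   and Aperm[]/Bperm[] carry the entries of perm[] in the corresponding sorted order.
*/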
6190 static PetscErrorCode MatSplitEntries_Internal(Mat mat, PetscCount n, const PetscInt i[], PetscInt j[], PetscCount perm[], PetscCount rowBegin[], PetscCount rowMid[], PetscCount rowEnd[], PetscCount *Atot_, PetscCount **Aperm_, PetscCount *Annz_, PetscCount **Ajmap_, PetscCount *Btot_, PetscCount **Bperm_, PetscCount *Bnnz_, PetscCount **Bjmap_)
6191 {
6192   PetscInt    cstart, cend, rstart, rend, row, col;
6193   PetscCount  Atot = 0, Btot = 0; /* Total number of nonzeros in the diagonal and off-diagonal blocks */
6194   PetscCount  Annz = 0, Bnnz = 0; /* Number of unique nonzeros in the diagonal and off-diagonal blocks */
6195   PetscCount  k, m, p, q, r, s, mid;
6196   PetscCount *Aperm, *Bperm, *Ajmap, *Bjmap;
6197 
6198   PetscFunctionBegin;
6199   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6200   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6201   m = rend - rstart;
6202 
6203   /* Skip negative rows */
6204   for (k = 0; k < n; k++)
6205     if (i[k] >= 0) break;
6206 
6207   /* Process [k,n): sort and partition each local row into diag and offdiag portions,
6208      fill rowBegin[], rowMid[], rowEnd[], and count Atot, Btot, Annz, Bnnz.
6209   */
6210   while (k < n) {
6211     row = i[k];
6212     /* Entries in [k,s) are in one row. Shift diagonal block col indices so that diag is ahead of offdiag after sorting the row */
6213     for (s = k; s < n; s++)
6214       if (i[s] != row) break;
6215 
6216     /* Shift diag columns to range of [-PETSC_INT_MAX, -1] */
6217     for (p = k; p < s; p++) {
6218       if (j[p] >= cstart && j[p] < cend) j[p] -= PETSC_INT_MAX;
6219     }
6220     PetscCall(PetscSortIntWithCountArray(s - k, j + k, perm + k));
6221     PetscCall(PetscSortedIntUpperBound(j, k, s, -1, &mid)); /* Separate [k,s) into [k,mid) for diag and [mid,s) for offdiag */
6222     rowBegin[row - rstart] = k;
6223     rowMid[row - rstart]   = mid;
6224     rowEnd[row - rstart]   = s;
6225     PetscCheck(k == s || j[s - 1] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index %" PetscInt_FMT " is >= matrix column size %" PetscInt_FMT, j[s - 1], mat->cmap->N);
6226 
6227     /* Count nonzeros of this diag/offdiag row, which might have repeats */
6228     Atot += mid - k;
6229     Btot += s - mid;
6230 
6231     /* Count unique nonzeros of this diag row */
6232     for (p = k; p < mid;) {
6233       col = j[p];
6234       do {
6235         j[p] += PETSC_INT_MAX; /* Revert the modified diagonal indices */
6236         p++;
6237       } while (p < mid && j[p] == col);
6238       Annz++;
6239     }
6240 
6241     /* Count unique nonzeros of this offdiag row */
6242     for (p = mid; p < s;) {
6243       col = j[p];
6244       do {
6245         p++;
6246       } while (p < s && j[p] == col);
6247       Bnnz++;
6248     }
6249     k = s;
6250   }
6251 
6252   /* Allocation according to Atot, Btot, Annz, Bnnz */
6253   PetscCall(PetscMalloc1(Atot, &Aperm));
6254   PetscCall(PetscMalloc1(Btot, &Bperm));
6255   PetscCall(PetscMalloc1(Annz + 1, &Ajmap));
6256   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap));
6257 
6258   /* Re-scan indices and copy diag/offdiag permutation indices to Aperm, Bperm and also fill Ajmap and Bjmap */
6259   Ajmap[0] = Bjmap[0] = Atot = Btot = Annz = Bnnz = 0;
6260   for (r = 0; r < m; r++) {
6261     k   = rowBegin[r];
6262     mid = rowMid[r];
6263     s   = rowEnd[r];
6264     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Aperm, Atot), PetscSafePointerPlusOffset(perm, k), mid - k));
6265     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Bperm, Btot), PetscSafePointerPlusOffset(perm, mid), s - mid));
6266     Atot += mid - k;
6267     Btot += s - mid;
6268 
6269     /* Scan column indices in this row and find out how many repeats each unique nonzero has */
6270     for (p = k; p < mid;) {
6271       col = j[p];
6272       q   = p;
6273       do {
6274         p++;
6275       } while (p < mid && j[p] == col);
6276       Ajmap[Annz + 1] = Ajmap[Annz] + (p - q);
6277       Annz++;
6278     }
6279 
6280     for (p = mid; p < s;) {
6281       col = j[p];
6282       q   = p;
6283       do {
6284         p++;
6285       } while (p < s && j[p] == col);
6286       Bjmap[Bnnz + 1] = Bjmap[Bnnz] + (p - q);
6287       Bnnz++;
6288     }
6289   }
6290   /* Output */
6291   *Aperm_ = Aperm;
6292   *Annz_  = Annz;
6293   *Atot_  = Atot;
6294   *Ajmap_ = Ajmap;
6295   *Bperm_ = Bperm;
6296   *Bnnz_  = Bnnz;
6297   *Btot_  = Btot;
6298   *Bjmap_ = Bjmap;
6299   PetscFunctionReturn(PETSC_SUCCESS);
6300 }
6301 
6302 /*
6303   Expand the jmap[] array to make a new one in view of nonzeros in the merged matrix
6304 
6305   Input Parameters:
6306     nnz1: number of unique nonzeros in a set that was used to produce imap[], jmap[]
6307     nnz:  number of unique nonzeros in the merged matrix
6308     imap[nnz1]: i-th nonzero in the set is the imap[i]-th nonzero in the merged matrix
6309     jmap[nnz1+1]: i-th nonzero in the set has jmap[i+1] - jmap[i] repeats in the set
6310 
6311   Output Parameter: (memory is allocated by the caller)
6312     jmap_new[nnz+1]: i-th nonzero in the merged matrix has jmap_new[i+1] - jmap_new[i] repeats in the set
6313 
6314   Example:
6315     nnz1 = 4
6316     nnz  = 6
6317     imap = [1,3,4,5]
6318     jmap = [0,3,5,6,7]
6319    then,
6320     jmap_new = [0,0,3,3,5,6,7]
6321 */
6322 static PetscErrorCode ExpandJmap_Internal(PetscCount nnz1, PetscCount nnz, const PetscCount imap[], const PetscCount jmap[], PetscCount jmap_new[])
6323 {
6324   PetscCount k, p;
6325 
6326   PetscFunctionBegin;
6327   jmap_new[0] = 0;
6328   p           = nnz;                /* p loops over jmap_new[] backwards */
6329   for (k = nnz1 - 1; k >= 0; k--) { /* k loops over imap[] */
6330     for (; p > imap[k]; p--) jmap_new[p] = jmap[k + 1];
6331   }
6332   for (; p >= 0; p--) jmap_new[p] = jmap[0];
6333   PetscFunctionReturn(PETSC_SUCCESS);
6334 }
6335 
6336 static PetscErrorCode MatCOOStructDestroy_MPIAIJ(void **data)
6337 {
6338   MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)*data;
6339 
6340   PetscFunctionBegin;
6341   PetscCall(PetscSFDestroy(&coo->sf));
6342   PetscCall(PetscFree(coo->Aperm1));
6343   PetscCall(PetscFree(coo->Bperm1));
6344   PetscCall(PetscFree(coo->Ajmap1));
6345   PetscCall(PetscFree(coo->Bjmap1));
6346   PetscCall(PetscFree(coo->Aimap2));
6347   PetscCall(PetscFree(coo->Bimap2));
6348   PetscCall(PetscFree(coo->Aperm2));
6349   PetscCall(PetscFree(coo->Bperm2));
6350   PetscCall(PetscFree(coo->Ajmap2));
6351   PetscCall(PetscFree(coo->Bjmap2));
6352   PetscCall(PetscFree(coo->Cperm1));
6353   PetscCall(PetscFree2(coo->sendbuf, coo->recvbuf));
6354   PetscCall(PetscFree(coo));
6355   PetscFunctionReturn(PETSC_SUCCESS);
6356 }
6357 
6358 PetscErrorCode MatSetPreallocationCOO_MPIAIJ(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
6359 {
6360   MPI_Comm             comm;
6361   PetscMPIInt          rank, size;
6362   PetscInt             m, n, M, N, rstart, rend, cstart, cend; /* Sizes, indices of row/col, therefore with type PetscInt */
6363   PetscCount           k, p, q, rem;                           /* Loop variables over coo arrays */
6364   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6365   PetscContainer       container;
6366   MatCOOStruct_MPIAIJ *coo;
6367 
6368   PetscFunctionBegin;
6369   PetscCall(PetscFree(mpiaij->garray));
6370   PetscCall(VecDestroy(&mpiaij->lvec));
6371 #if defined(PETSC_USE_CTABLE)
6372   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
6373 #else
6374   PetscCall(PetscFree(mpiaij->colmap));
6375 #endif
6376   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
6377   mat->assembled     = PETSC_FALSE;
6378   mat->was_assembled = PETSC_FALSE;
6379 
6380   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
6381   PetscCallMPI(MPI_Comm_size(comm, &size));
6382   PetscCallMPI(MPI_Comm_rank(comm, &rank));
6383   PetscCall(PetscLayoutSetUp(mat->rmap));
6384   PetscCall(PetscLayoutSetUp(mat->cmap));
6385   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6386   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6387   PetscCall(MatGetLocalSize(mat, &m, &n));
6388   PetscCall(MatGetSize(mat, &M, &N));
6389 
6390   /* Sort (i,j) by row along with a permutation array, so that the to-be-ignored */
6391   /* entries come first, then local rows, then remote rows.                     */
6392   PetscCount n1 = coo_n, *perm1;
6393   PetscInt  *i1 = coo_i, *j1 = coo_j;
6394 
6395   PetscCall(PetscMalloc1(n1, &perm1));
6396   for (k = 0; k < n1; k++) perm1[k] = k;
6397 
6398   /* Manipulate indices so that entries with negative row or col indices will have smallest
6399      row indices, local entries will have greater but negative row indices, and remote entries
6400      will have positive row indices.
6401   */
6402   for (k = 0; k < n1; k++) {
6403     if (i1[k] < 0 || j1[k] < 0) i1[k] = PETSC_INT_MIN;                /* e.g., -2^31, minimal to move them ahead */
6404     else if (i1[k] >= rstart && i1[k] < rend) i1[k] -= PETSC_INT_MAX; /* e.g., minus 2^31-1 to shift local rows to range of [-PETSC_INT_MAX, -1] */
6405     else {
6406       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "MAT_NO_OFF_PROC_ENTRIES is set but insert to remote rows");
6407       if (mpiaij->donotstash) i1[k] = PETSC_INT_MIN; /* Ignore offproc entries as if they had negative indices */
6408     }
6409   }
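  /* For instance (made-up values), with [rstart,rend) = [4,8): an entry with i = -1 or j = -1 gets row
     PETSC_INT_MIN; a local entry with i = 5 gets row 5 - PETSC_INT_MAX (negative, yet greater than
     PETSC_INT_MIN); a remote entry with i = 12 keeps row 12 (assuming donotstash is false). The sort by
     row below therefore groups the entries as: ignored, then local, then remote. */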
6410 
6411   /* Sort by row; after that, [0,k) have ignored entries, [k,rem) have local rows and [rem,n1) have remote rows */
6412   PetscCall(PetscSortIntWithIntCountArrayPair(n1, i1, j1, perm1));
6413 
6414   /* Advance k to the first entry we need to take care of */
6415   for (k = 0; k < n1; k++)
6416     if (i1[k] > PETSC_INT_MIN) break;
6417   PetscCount i1start = k;
6418 
6419   PetscCall(PetscSortedIntUpperBound(i1, k, n1, rend - 1 - PETSC_INT_MAX, &rem)); /* rem is upper bound of the last local row */
6420   for (; k < rem; k++) i1[k] += PETSC_INT_MAX;                                    /* Revert row indices of local rows*/
6421 
6422   PetscCheck(i1 == NULL || i1[n1 - 1] < M, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "COO row index %" PetscInt_FMT " is >= the matrix row size %" PetscInt_FMT, i1[n1 - 1], M);
6423 
6424   /*           Send remote rows to their owner                                  */
6425   /* Find which rows should be sent to which remote ranks*/
6426   PetscInt        nsend = 0; /* Number of MPI ranks to send data to */
6427   PetscMPIInt    *sendto;    /* [nsend], storing remote ranks */
6428   PetscInt       *nentries;  /* [nsend], storing number of entries sent to remote ranks; Assume PetscInt is big enough for this count, and error if not */
6429   const PetscInt *ranges;
6430   PetscInt        maxNsend = size >= 128 ? 128 : size; /* Assume max 128 neighbors; realloc when needed */
6431 
6432   PetscCall(PetscLayoutGetRanges(mat->rmap, &ranges));
6433   PetscCall(PetscMalloc2(maxNsend, &sendto, maxNsend, &nentries));
6434   for (k = rem; k < n1;) {
6435     PetscMPIInt owner;
6436     PetscInt    firstRow, lastRow;
6437 
6438     /* Locate a row range */
6439     firstRow = i1[k]; /* first row of this owner */
6440     PetscCall(PetscLayoutFindOwner(mat->rmap, firstRow, &owner));
6441     lastRow = ranges[owner + 1] - 1; /* last row of this owner */
6442 
6443     /* Find the first index 'p' in [k,n1) with i1[p] belonging to the next owner */
6444     PetscCall(PetscSortedIntUpperBound(i1, k, n1, lastRow, &p));
6445 
6446     /* All entries in [k,p) belong to this remote owner */
6447     if (nsend >= maxNsend) { /* Double the remote ranks arrays if not long enough */
6448       PetscMPIInt *sendto2;
6449       PetscInt    *nentries2;
6450       PetscInt     maxNsend2 = (maxNsend <= size / 2) ? maxNsend * 2 : size;
6451 
6452       PetscCall(PetscMalloc2(maxNsend2, &sendto2, maxNsend2, &nentries2));
6453       PetscCall(PetscArraycpy(sendto2, sendto, maxNsend));
6454       PetscCall(PetscArraycpy(nentries2, nentries, maxNsend));
6455       PetscCall(PetscFree2(sendto, nentries));
6456       sendto   = sendto2;
6457       nentries = nentries2;
6458       maxNsend = maxNsend2;
6459     }
6460     sendto[nsend] = owner;
6461     PetscCall(PetscIntCast(p - k, &nentries[nsend]));
6462     nsend++;
6463     k = p;
6464   }
6465 
6466   /* Build 1st SF to know offsets on remote to send data */
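  /* Each rank has a single root (index 0) and one leaf per destination rank. The fetch-and-op with
     MPI_SUM below accumulates nentries[] into nroots2 on the owning rank and returns in offsets[] the
     root value seen just before each contribution, i.e. the starting offset at which this rank's
     entries will be placed in the owner's receive space. */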
6467   PetscSF      sf1;
6468   PetscInt     nroots = 1, nroots2 = 0;
6469   PetscInt     nleaves = nsend, nleaves2 = 0;
6470   PetscInt    *offsets;
6471   PetscSFNode *iremote;
6472 
6473   PetscCall(PetscSFCreate(comm, &sf1));
6474   PetscCall(PetscMalloc1(nsend, &iremote));
6475   PetscCall(PetscMalloc1(nsend, &offsets));
6476   for (k = 0; k < nsend; k++) {
6477     iremote[k].rank  = sendto[k];
6478     iremote[k].index = 0;
6479     nleaves2 += nentries[k];
6480     PetscCheck(nleaves2 >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF leaves is too large for PetscInt");
6481   }
6482   PetscCall(PetscSFSetGraph(sf1, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6483   PetscCall(PetscSFFetchAndOpWithMemTypeBegin(sf1, MPIU_INT, PETSC_MEMTYPE_HOST, &nroots2 /*rootdata*/, PETSC_MEMTYPE_HOST, nentries /*leafdata*/, PETSC_MEMTYPE_HOST, offsets /*leafupdate*/, MPI_SUM));
6484   PetscCall(PetscSFFetchAndOpEnd(sf1, MPIU_INT, &nroots2, nentries, offsets, MPI_SUM)); /* Should nroots2 overflow, the offsets[] check below would catch it */
6485   PetscCall(PetscSFDestroy(&sf1));
6486   PetscAssert(nleaves2 == n1 - rem, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nleaves2 %" PetscInt_FMT " != number of remote entries %" PetscCount_FMT, nleaves2, n1 - rem);
6487 
6488   /* Build 2nd SF to send remote COOs to their owner */
6489   PetscSF sf2;
6490   nroots  = nroots2;
6491   nleaves = nleaves2;
6492   PetscCall(PetscSFCreate(comm, &sf2));
6493   PetscCall(PetscSFSetFromOptions(sf2));
6494   PetscCall(PetscMalloc1(nleaves, &iremote));
6495   p = 0;
6496   for (k = 0; k < nsend; k++) {
6497     PetscCheck(offsets[k] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF roots is too large for PetscInt");
6498     for (q = 0; q < nentries[k]; q++, p++) {
6499       iremote[p].rank = sendto[k];
6500       PetscCall(PetscIntCast(offsets[k] + q, &iremote[p].index));
6501     }
6502   }
6503   PetscCall(PetscSFSetGraph(sf2, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6504 
6505   /* Send the remote COOs to their owner */
6506   PetscInt    n2 = nroots, *i2, *j2; /* Buffers for received COOs from other ranks, along with a permutation array */
6507   PetscCount *perm2;                 /* Though PetscInt is enough for remote entries, we use PetscCount here as we want to reuse MatSplitEntries_Internal() */
6508   PetscCall(PetscMalloc3(n2, &i2, n2, &j2, n2, &perm2));
6509   PetscAssert(rem == 0 || i1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6510   PetscAssert(rem == 0 || j1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6511   PetscInt *i1prem = PetscSafePointerPlusOffset(i1, rem);
6512   PetscInt *j1prem = PetscSafePointerPlusOffset(j1, rem);
6513   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, i1prem, PETSC_MEMTYPE_HOST, i2, MPI_REPLACE));
6514   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, i1prem, i2, MPI_REPLACE));
6515   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, j1prem, PETSC_MEMTYPE_HOST, j2, MPI_REPLACE));
6516   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, j1prem, j2, MPI_REPLACE));
6517 
6518   PetscCall(PetscFree(offsets));
6519   PetscCall(PetscFree2(sendto, nentries));
6520 
6521   /* Sort received COOs by row along with the permutation array     */
6522   for (k = 0; k < n2; k++) perm2[k] = k;
6523   PetscCall(PetscSortIntWithIntCountArrayPair(n2, i2, j2, perm2));
6524 
6525   /* sf2 only sends contiguous leafdata to contiguous rootdata. We record the permutation which will be used to fill leafdata */
6526   PetscCount *Cperm1;
6527   PetscAssert(rem == 0 || perm1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6528   PetscCount *perm1prem = PetscSafePointerPlusOffset(perm1, rem);
6529   PetscCall(PetscMalloc1(nleaves, &Cperm1));
6530   PetscCall(PetscArraycpy(Cperm1, perm1prem, nleaves));
6531 
6532   /* Support for HYPRE matrices, kind of a hack.
6533      Swap min column with diagonal so that diagonal values will go first */
6534   PetscBool hypre;
6535   PetscCall(PetscStrcmp("_internal_COO_mat_for_hypre", ((PetscObject)mat)->name, &hypre));
6536   if (hypre) {
6537     PetscInt *minj;
6538     PetscBT   hasdiag;
6539 
6540     PetscCall(PetscBTCreate(m, &hasdiag));
6541     PetscCall(PetscMalloc1(m, &minj));
6542     for (k = 0; k < m; k++) minj[k] = PETSC_INT_MAX;
6543     for (k = i1start; k < rem; k++) {
6544       if (j1[k] < cstart || j1[k] >= cend) continue;
6545       const PetscInt rindex = i1[k] - rstart;
6546       if ((j1[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6547       minj[rindex] = PetscMin(minj[rindex], j1[k]);
6548     }
6549     for (k = 0; k < n2; k++) {
6550       if (j2[k] < cstart || j2[k] >= cend) continue;
6551       const PetscInt rindex = i2[k] - rstart;
6552       if ((j2[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6553       minj[rindex] = PetscMin(minj[rindex], j2[k]);
6554     }
6555     for (k = i1start; k < rem; k++) {
6556       const PetscInt rindex = i1[k] - rstart;
6557       if (j1[k] < cstart || j1[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6558       if (j1[k] == minj[rindex]) j1[k] = i1[k] + (cstart - rstart);
6559       else if ((j1[k] - cstart) == rindex) j1[k] = minj[rindex];
6560     }
6561     for (k = 0; k < n2; k++) {
6562       const PetscInt rindex = i2[k] - rstart;
6563       if (j2[k] < cstart || j2[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6564       if (j2[k] == minj[rindex]) j2[k] = i2[k] + (cstart - rstart);
6565       else if ((j2[k] - cstart) == rindex) j2[k] = minj[rindex];
6566     }
6567     PetscCall(PetscBTDestroy(&hasdiag));
6568     PetscCall(PetscFree(minj));
6569   }
6570 
6571   /* Split local COOs and received COOs into diag/offdiag portions */
6572   PetscCount *rowBegin1, *rowMid1, *rowEnd1;
6573   PetscCount *Ajmap1, *Aperm1, *Bjmap1, *Bperm1;
6574   PetscCount  Annz1, Bnnz1, Atot1, Btot1;
6575   PetscCount *rowBegin2, *rowMid2, *rowEnd2;
6576   PetscCount *Ajmap2, *Aperm2, *Bjmap2, *Bperm2;
6577   PetscCount  Annz2, Bnnz2, Atot2, Btot2;
6578 
6579   PetscCall(PetscCalloc3(m, &rowBegin1, m, &rowMid1, m, &rowEnd1));
6580   PetscCall(PetscCalloc3(m, &rowBegin2, m, &rowMid2, m, &rowEnd2));
6581   PetscCall(MatSplitEntries_Internal(mat, rem, i1, j1, perm1, rowBegin1, rowMid1, rowEnd1, &Atot1, &Aperm1, &Annz1, &Ajmap1, &Btot1, &Bperm1, &Bnnz1, &Bjmap1));
6582   PetscCall(MatSplitEntries_Internal(mat, n2, i2, j2, perm2, rowBegin2, rowMid2, rowEnd2, &Atot2, &Aperm2, &Annz2, &Ajmap2, &Btot2, &Bperm2, &Bnnz2, &Bjmap2));
6583 
6584   /* Merge local COOs with received COOs: diag with diag, offdiag with offdiag */
6585   PetscInt *Ai, *Bi;
6586   PetscInt *Aj, *Bj;
6587 
6588   PetscCall(PetscMalloc1(m + 1, &Ai));
6589   PetscCall(PetscMalloc1(m + 1, &Bi));
6590   PetscCall(PetscMalloc1(Annz1 + Annz2, &Aj)); /* Since local and remote entries might have dups, we might allocate excess memory */
6591   PetscCall(PetscMalloc1(Bnnz1 + Bnnz2, &Bj));
6592 
6593   PetscCount *Aimap1, *Bimap1, *Aimap2, *Bimap2;
6594   PetscCall(PetscMalloc1(Annz1, &Aimap1));
6595   PetscCall(PetscMalloc1(Bnnz1, &Bimap1));
6596   PetscCall(PetscMalloc1(Annz2, &Aimap2));
6597   PetscCall(PetscMalloc1(Bnnz2, &Bimap2));
6598 
6599   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowBegin1, rowMid1, rowBegin2, rowMid2, Ajmap1, Ajmap2, Aimap1, Aimap2, Ai, Aj));
6600   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowMid1, rowEnd1, rowMid2, rowEnd2, Bjmap1, Bjmap2, Bimap1, Bimap2, Bi, Bj));
6601 
6602   /* Expand Ajmap1/Bjmap1 to make them based off nonzeros in A/B, since we     */
6603   /* expect nonzeros in A/B most likely have local contributing entries        */
6604   PetscInt    Annz = Ai[m];
6605   PetscInt    Bnnz = Bi[m];
6606   PetscCount *Ajmap1_new, *Bjmap1_new;
6607 
6608   PetscCall(PetscMalloc1(Annz + 1, &Ajmap1_new));
6609   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap1_new));
6610 
6611   PetscCall(ExpandJmap_Internal(Annz1, Annz, Aimap1, Ajmap1, Ajmap1_new));
6612   PetscCall(ExpandJmap_Internal(Bnnz1, Bnnz, Bimap1, Bjmap1, Bjmap1_new));
6613 
6614   PetscCall(PetscFree(Aimap1));
6615   PetscCall(PetscFree(Ajmap1));
6616   PetscCall(PetscFree(Bimap1));
6617   PetscCall(PetscFree(Bjmap1));
6618   PetscCall(PetscFree3(rowBegin1, rowMid1, rowEnd1));
6619   PetscCall(PetscFree3(rowBegin2, rowMid2, rowEnd2));
6620   PetscCall(PetscFree(perm1));
6621   PetscCall(PetscFree3(i2, j2, perm2));
6622 
6623   Ajmap1 = Ajmap1_new;
6624   Bjmap1 = Bjmap1_new;
6625 
6626   /* Reallocate Aj, Bj once we know actual numbers of unique nonzeros in A and B */
6627   if (Annz < Annz1 + Annz2) {
6628     PetscInt *Aj_new;
6629     PetscCall(PetscMalloc1(Annz, &Aj_new));
6630     PetscCall(PetscArraycpy(Aj_new, Aj, Annz));
6631     PetscCall(PetscFree(Aj));
6632     Aj = Aj_new;
6633   }
6634 
6635   if (Bnnz < Bnnz1 + Bnnz2) {
6636     PetscInt *Bj_new;
6637     PetscCall(PetscMalloc1(Bnnz, &Bj_new));
6638     PetscCall(PetscArraycpy(Bj_new, Bj, Bnnz));
6639     PetscCall(PetscFree(Bj));
6640     Bj = Bj_new;
6641   }
6642 
6643   /* Create new submatrices for on-process and off-process coupling                  */
6644   PetscScalar     *Aa, *Ba;
6645   MatType          rtype;
6646   Mat_SeqAIJ      *a, *b;
6647   PetscObjectState state;
6648   PetscCall(PetscCalloc1(Annz, &Aa)); /* Zero matrix on device */
6649   PetscCall(PetscCalloc1(Bnnz, &Ba));
6650   /* make Aj[] local, i.e., based off the start column of the diagonal portion */
6651   if (cstart) {
6652     for (k = 0; k < Annz; k++) Aj[k] -= cstart;
6653   }
6654 
6655   PetscCall(MatGetRootType_Private(mat, &rtype));
6656 
6657   MatSeqXAIJGetOptions_Private(mpiaij->A);
6658   PetscCall(MatDestroy(&mpiaij->A));
6659   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, Ai, Aj, Aa, &mpiaij->A));
6660   PetscCall(MatSetBlockSizesFromMats(mpiaij->A, mat, mat));
6661   MatSeqXAIJRestoreOptions_Private(mpiaij->A);
6662 
6663   MatSeqXAIJGetOptions_Private(mpiaij->B);
6664   PetscCall(MatDestroy(&mpiaij->B));
6665   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, mat->cmap->N, Bi, Bj, Ba, &mpiaij->B));
6666   PetscCall(MatSetBlockSizesFromMats(mpiaij->B, mat, mat));
6667   MatSeqXAIJRestoreOptions_Private(mpiaij->B);
6668 
6669   PetscCall(MatSetUpMultiply_MPIAIJ(mat));
6670   mat->was_assembled = PETSC_TRUE; // was_assembled in effect means the Mvctx is built; doing so avoids redundant MatSetUpMultiply_MPIAIJ
6671   state              = mpiaij->A->nonzerostate + mpiaij->B->nonzerostate;
6672   PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
6673 
6674   a          = (Mat_SeqAIJ *)mpiaij->A->data;
6675   b          = (Mat_SeqAIJ *)mpiaij->B->data;
6676   a->free_a  = PETSC_TRUE;
6677   a->free_ij = PETSC_TRUE;
6678   b->free_a  = PETSC_TRUE;
6679   b->free_ij = PETSC_TRUE;
6680   a->maxnz   = a->nz;
6681   b->maxnz   = b->nz;
6682 
6683   /* conversion must happen AFTER multiply setup */
6684   PetscCall(MatConvert(mpiaij->A, rtype, MAT_INPLACE_MATRIX, &mpiaij->A));
6685   PetscCall(MatConvert(mpiaij->B, rtype, MAT_INPLACE_MATRIX, &mpiaij->B));
6686   PetscCall(VecDestroy(&mpiaij->lvec));
6687   PetscCall(MatCreateVecs(mpiaij->B, &mpiaij->lvec, NULL));
6688 
6689   // Put the COO struct in a container and then attach that to the matrix
6690   PetscCall(PetscMalloc1(1, &coo));
6691   coo->n       = coo_n;
6692   coo->sf      = sf2;
6693   coo->sendlen = nleaves;
6694   coo->recvlen = nroots;
6695   coo->Annz    = Annz;
6696   coo->Bnnz    = Bnnz;
6697   coo->Annz2   = Annz2;
6698   coo->Bnnz2   = Bnnz2;
6699   coo->Atot1   = Atot1;
6700   coo->Atot2   = Atot2;
6701   coo->Btot1   = Btot1;
6702   coo->Btot2   = Btot2;
6703   coo->Ajmap1  = Ajmap1;
6704   coo->Aperm1  = Aperm1;
6705   coo->Bjmap1  = Bjmap1;
6706   coo->Bperm1  = Bperm1;
6707   coo->Aimap2  = Aimap2;
6708   coo->Ajmap2  = Ajmap2;
6709   coo->Aperm2  = Aperm2;
6710   coo->Bimap2  = Bimap2;
6711   coo->Bjmap2  = Bjmap2;
6712   coo->Bperm2  = Bperm2;
6713   coo->Cperm1  = Cperm1;
6714   // Allocate in preallocation. If not used, it has zero cost on host
6715   PetscCall(PetscMalloc2(coo->sendlen, &coo->sendbuf, coo->recvlen, &coo->recvbuf));
6716   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
6717   PetscCall(PetscContainerSetPointer(container, coo));
6718   PetscCall(PetscContainerSetCtxDestroy(container, MatCOOStructDestroy_MPIAIJ));
6719   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject)container));
6720   PetscCall(PetscContainerDestroy(&container));
6721   PetscFunctionReturn(PETSC_SUCCESS);
6722 }
6723 
6724 static PetscErrorCode MatSetValuesCOO_MPIAIJ(Mat mat, const PetscScalar v[], InsertMode imode)
6725 {
6726   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6727   Mat                  A = mpiaij->A, B = mpiaij->B;
6728   PetscScalar         *Aa, *Ba;
6729   PetscScalar         *sendbuf, *recvbuf;
6730   const PetscCount    *Ajmap1, *Ajmap2, *Aimap2;
6731   const PetscCount    *Bjmap1, *Bjmap2, *Bimap2;
6732   const PetscCount    *Aperm1, *Aperm2, *Bperm1, *Bperm2;
6733   const PetscCount    *Cperm1;
6734   PetscContainer       container;
6735   MatCOOStruct_MPIAIJ *coo;
6736 
6737   PetscFunctionBegin;
6738   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container));
6739   PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
6740   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
6741   sendbuf = coo->sendbuf;
6742   recvbuf = coo->recvbuf;
6743   Ajmap1  = coo->Ajmap1;
6744   Ajmap2  = coo->Ajmap2;
6745   Aimap2  = coo->Aimap2;
6746   Bjmap1  = coo->Bjmap1;
6747   Bjmap2  = coo->Bjmap2;
6748   Bimap2  = coo->Bimap2;
6749   Aperm1  = coo->Aperm1;
6750   Aperm2  = coo->Aperm2;
6751   Bperm1  = coo->Bperm1;
6752   Bperm2  = coo->Bperm2;
6753   Cperm1  = coo->Cperm1;
6754 
6755   PetscCall(MatSeqAIJGetArray(A, &Aa)); /* Might read and write matrix values */
6756   PetscCall(MatSeqAIJGetArray(B, &Ba));
6757 
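  /* Informal sketch of the accumulation done below: for the i-th unique local nonzero of the diagonal block,
       Aa[i] (+)= sum over k in [Ajmap1[i], Ajmap1[i+1]) of v[Aperm1[k]]
     and each received remote value recvbuf[Aperm2[k]], k in [Ajmap2[i], Ajmap2[i+1]), is added to Aa[Aimap2[i]].
     The off-diagonal block Ba[] is handled identically with the B* arrays. */
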
6758   /* Pack entries to be sent to remote */
6759   for (PetscCount i = 0; i < coo->sendlen; i++) sendbuf[i] = v[Cperm1[i]];
6760 
6761   /* Send remote entries to their owner and overlap the communication with local computation */
6762   PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_HOST, sendbuf, PETSC_MEMTYPE_HOST, recvbuf, MPI_REPLACE));
6763   /* Add local entries to A and B */
6764   for (PetscCount i = 0; i < coo->Annz; i++) { /* All nonzeros in A are either zero'ed or added with a value (i.e., initialized) */
6765     PetscScalar sum = 0.0;                     /* Do partial summation first to improve numerical stability */
6766     for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += v[Aperm1[k]];
6767     Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
6768   }
6769   for (PetscCount i = 0; i < coo->Bnnz; i++) {
6770     PetscScalar sum = 0.0;
6771     for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += v[Bperm1[k]];
6772     Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
6773   }
6774   PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE));
6775 
6776   /* Add received remote entries to A and B */
6777   for (PetscCount i = 0; i < coo->Annz2; i++) {
6778     for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += recvbuf[Aperm2[k]];
6779   }
6780   for (PetscCount i = 0; i < coo->Bnnz2; i++) {
6781     for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += recvbuf[Bperm2[k]];
6782   }
6783   PetscCall(MatSeqAIJRestoreArray(A, &Aa));
6784   PetscCall(MatSeqAIJRestoreArray(B, &Ba));
6785   PetscFunctionReturn(PETSC_SUCCESS);
6786 }
6787 
6788 /*MC
6789    MATMPIAIJ - MATMPIAIJ = "mpiaij" - A matrix type to be used for parallel sparse matrices.
6790 
6791    Options Database Keys:
6792 . -mat_type mpiaij - sets the matrix type to `MATMPIAIJ` during a call to `MatSetFromOptions()`
6793 
6794    Level: beginner
6795 
6796    Notes:
6797    `MatSetValues()` may be called for this matrix type with a `NULL` argument for the numerical values;
6798     in this case the values associated with the rows and columns one passes in are set to zero
6799     in the matrix
6800
6801     `MatSetOption`(A,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this case no
6802     space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored
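
   Example Usage (a minimal sketch; error checking with `PetscCall()` is omitted and the preallocation numbers
   5 and 2 are placeholders to be replaced by problem-specific estimates):
.vb
   Mat A;

   MatCreate(comm, &A);
   MatSetSizes(A, m, n, PETSC_DETERMINE, PETSC_DETERMINE);
   MatSetType(A, MATMPIAIJ);
   MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL); /* per-row nonzero estimates for the diagonal and off-diagonal blocks */
   /* insert entries with MatSetValues(), then call MatAssemblyBegin()/MatAssemblyEnd() */
.ve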
6803 
6804 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJ`, `MATAIJ`, `MatCreateAIJ()`
6805 M*/
6806 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJ(Mat B)
6807 {
6808   Mat_MPIAIJ *b;
6809   PetscMPIInt size;
6810 
6811   PetscFunctionBegin;
6812   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
6813 
6814   PetscCall(PetscNew(&b));
6815   B->data       = (void *)b;
6816   B->ops[0]     = MatOps_Values;
6817   B->assembled  = PETSC_FALSE;
6818   B->insertmode = NOT_SET_VALUES;
6819   b->size       = size;
6820 
6821   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B), &b->rank));
6822 
6823   /* build cache for off array entries formed */
6824   PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)B), 1, &B->stash));
6825 
6826   b->donotstash  = PETSC_FALSE;
6827   b->colmap      = NULL;
6828   b->garray      = NULL;
6829   b->roworiented = PETSC_TRUE;
6830 
6831   /* stuff used for matrix vector multiply */
6832   b->lvec  = NULL;
6833   b->Mvctx = NULL;
6834 
6835   /* stuff for MatGetRow() */
6836   b->rowindices   = NULL;
6837   b->rowvalues    = NULL;
6838   b->getrowactive = PETSC_FALSE;
6839 
6840   /* flexible pointer used in CUSPARSE classes */
6841   b->spptr = NULL;
6842 
6843   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetUseScalableIncreaseOverlap_C", MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ));
6844   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_MPIAIJ));
6845   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_MPIAIJ));
6846   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_MPIAIJ));
6847   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJ));
6848   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetPreallocation_C", MatResetPreallocation_MPIAIJ));
6849   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetHash_C", MatResetHash_MPIAIJ));
6850   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocationCSR_C", MatMPIAIJSetPreallocationCSR_MPIAIJ));
6851   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDiagonalScaleLocal_C", MatDiagonalScaleLocal_MPIAIJ));
6852   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijperm_C", MatConvert_MPIAIJ_MPIAIJPERM));
6853   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijsell_C", MatConvert_MPIAIJ_MPIAIJSELL));
6854 #if defined(PETSC_HAVE_CUDA)
6855   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcusparse_C", MatConvert_MPIAIJ_MPIAIJCUSPARSE));
6856 #endif
6857 #if defined(PETSC_HAVE_HIP)
6858   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijhipsparse_C", MatConvert_MPIAIJ_MPIAIJHIPSPARSE));
6859 #endif
6860 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6861   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijkokkos_C", MatConvert_MPIAIJ_MPIAIJKokkos));
6862 #endif
6863 #if defined(PETSC_HAVE_MKL_SPARSE)
6864   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijmkl_C", MatConvert_MPIAIJ_MPIAIJMKL));
6865 #endif
6866   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcrl_C", MatConvert_MPIAIJ_MPIAIJCRL));
6867   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpibaij_C", MatConvert_MPIAIJ_MPIBAIJ));
6868   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisbaij_C", MatConvert_MPIAIJ_MPISBAIJ));
6869   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpidense_C", MatConvert_MPIAIJ_MPIDense));
6870 #if defined(PETSC_HAVE_ELEMENTAL)
6871   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_elemental_C", MatConvert_MPIAIJ_Elemental));
6872 #endif
6873 #if defined(PETSC_HAVE_SCALAPACK)
6874   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_scalapack_C", MatConvert_AIJ_ScaLAPACK));
6875 #endif
6876   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_is_C", MatConvert_XAIJ_IS));
6877   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisell_C", MatConvert_MPIAIJ_MPISELL));
6878 #if defined(PETSC_HAVE_HYPRE)
6879   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_hypre_C", MatConvert_AIJ_HYPRE));
6880   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", MatProductSetFromOptions_Transpose_AIJ_AIJ));
6881 #endif
6882   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_is_mpiaij_C", MatProductSetFromOptions_IS_XAIJ));
6883   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_mpiaij_mpiaij_C", MatProductSetFromOptions_MPIAIJ));
6884   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJ));
6885   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJ));
6886   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJ));
6887   PetscFunctionReturn(PETSC_SUCCESS);
6888 }
6889 
6890 /*@
6891   MatCreateMPIAIJWithSplitArrays - creates a `MATMPIAIJ` matrix using arrays that contain the "diagonal"
6892   and "off-diagonal" part of the matrix in CSR format.
6893 
6894   Collective
6895 
6896   Input Parameters:
6897 + comm - MPI communicator
6898 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
6899 . n    - This value should be the same as the local size used in creating the
6900          x vector for the matrix-vector product $y = Ax$. (or `PETSC_DECIDE` to have
6901          calculated if `N` is given) For square matrices `n` is almost always `m`.
6902 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
6903 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
6904 . i    - row indices for "diagonal" portion of matrix; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
6905 . j    - column indices, which must be local, i.e., based off the start column of the diagonal portion
6906 . a    - matrix values
6907 . oi   - row indices for "off-diagonal" portion of matrix; that is oi[0] = 0, oi[row] = oi[row-1] + number of elements in that row of the matrix
6908 . oj   - column indices, which must be global, representing global columns in the `MATMPIAIJ` matrix
6909 - oa   - matrix values
6910 
6911   Output Parameter:
6912 . mat - the matrix
6913 
6914   Level: advanced
6915 
6916   Notes:
6917   The `i`, `j`, and `a` arrays ARE NOT copied by this routine into the internal format used by PETSc (even in Fortran). The user
6918   must free the arrays once the matrix has been destroyed and not before.
6919 
6920   The `i` and `j` indices are 0 based
6921 
6922   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix
6923 
6924   This sets local rows and cannot be used to set off-processor values.
6925 
6926   Use of this routine is discouraged because it is inflexible and cumbersome to use. It is extremely rare that a
6927   legacy application natively assembles into exactly this split format. The code to do so is nontrivial and does
6928   not easily support in-place reassembly. It is recommended to use `MatSetValues()` (or a variant thereof) because
6929   the resulting assembly is easier to implement, will work with any matrix format, and the user does not have to
6930   keep track of the underlying array. Use `MatSetOption`(A,`MAT_NO_OFF_PROC_ENTRIES`,`PETSC_TRUE`) to disable all
6931   communication if it is known that only local entries will be set.
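
  Example Usage:
  For illustration only (a made-up 4x4 matrix on 2 ranks, each owning 2 rows and 2 columns), rank 0 could pass its
  rows 0-1 with entries A(0,0)=1, A(0,3)=2, A(1,1)=3, A(1,2)=4 as the arrays below; note that in real usage the
  arrays must outlive the matrix, since they are not copied.
.vb
  PetscInt    i[]  = {0, 1, 2}, j[]  = {0, 1};  PetscScalar a[]  = {1.0, 3.0}; /* diagonal block, local column indices      */
  PetscInt    oi[] = {0, 1, 2}, oj[] = {3, 2};  PetscScalar oa[] = {2.0, 4.0}; /* off-diagonal block, global column indices */

  MatCreateMPIAIJWithSplitArrays(comm, 2, 2, 4, 4, i, j, a, oi, oj, oa, &A);
.ve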
6932 
6933 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
6934           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithArrays()`
6935 @*/
6936 PetscErrorCode MatCreateMPIAIJWithSplitArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt i[], PetscInt j[], PetscScalar a[], PetscInt oi[], PetscInt oj[], PetscScalar oa[], Mat *mat)
6937 {
6938   Mat_MPIAIJ *maij;
6939 
6940   PetscFunctionBegin;
6941   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE or negative");
6942   PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
6943   PetscCheck(oi[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "oi (row indices) must start with 0");
6944   PetscCall(MatCreate(comm, mat));
6945   PetscCall(MatSetSizes(*mat, m, n, M, N));
6946   PetscCall(MatSetType(*mat, MATMPIAIJ));
6947   maij = (Mat_MPIAIJ *)(*mat)->data;
6948 
6949   (*mat)->preallocated = PETSC_TRUE;
6950 
6951   PetscCall(PetscLayoutSetUp((*mat)->rmap));
6952   PetscCall(PetscLayoutSetUp((*mat)->cmap));
6953 
6954   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, i, j, a, &maij->A));
6955   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, (*mat)->cmap->N, oi, oj, oa, &maij->B));
6956 
6957   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
6958   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
6959   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
6960   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
6961   PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
6962   PetscFunctionReturn(PETSC_SUCCESS);
6963 }
6964 
6965 typedef struct {
6966   Mat       *mp;    /* intermediate products */
6967   PetscBool *mptmp; /* is the intermediate product temporary ? */
6968   PetscInt   cp;    /* number of intermediate products */
6969 
6970   /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
6971   PetscInt    *startsj_s, *startsj_r;
6972   PetscScalar *bufa;
6973   Mat          P_oth;
6974 
6975   /* may take advantage of merging product->B */
6976   Mat Bloc; /* B-local by merging diag and off-diag */
6977 
6978   /* cusparse does not have support to split between symbolic and numeric phases.
6979      When api_user is true, we don't need to update the numerical values
6980      of the temporary storage */
6981   PetscBool reusesym;
6982 
6983   /* support for COO values insertion */
6984   PetscScalar *coo_v, *coo_w; /* store on-process and off-process COO scalars, and used as MPI recv/send buffers respectively */
6985   PetscInt   **own;           /* own[i] points to address of on-process COO indices for Mat mp[i] */
6986   PetscInt   **off;           /* off[i] points to address of off-process COO indices for Mat mp[i] */
6987   PetscBool    hasoffproc;    /* if true, have off-process values insertion (i.e. AtB or PtAP) */
6988   PetscSF      sf;            /* used for non-local values insertion and memory malloc */
6989   PetscMemType mtype;
6990 
6991   /* customization */
6992   PetscBool abmerge;
6993   PetscBool P_oth_bind;
6994 } MatMatMPIAIJBACKEND;
6995 
6996 static PetscErrorCode MatDestroy_MatMatMPIAIJBACKEND(void *data)
6997 {
6998   MatMatMPIAIJBACKEND *mmdata = (MatMatMPIAIJBACKEND *)data;
6999   PetscInt             i;
7000 
7001   PetscFunctionBegin;
7002   PetscCall(PetscFree2(mmdata->startsj_s, mmdata->startsj_r));
7003   PetscCall(PetscFree(mmdata->bufa));
7004   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_v));
7005   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_w));
7006   PetscCall(MatDestroy(&mmdata->P_oth));
7007   PetscCall(MatDestroy(&mmdata->Bloc));
7008   PetscCall(PetscSFDestroy(&mmdata->sf));
7009   for (i = 0; i < mmdata->cp; i++) PetscCall(MatDestroy(&mmdata->mp[i]));
7010   PetscCall(PetscFree2(mmdata->mp, mmdata->mptmp));
7011   PetscCall(PetscFree(mmdata->own[0]));
7012   PetscCall(PetscFree(mmdata->own));
7013   PetscCall(PetscFree(mmdata->off[0]));
7014   PetscCall(PetscFree(mmdata->off));
7015   PetscCall(PetscFree(mmdata));
7016   PetscFunctionReturn(PETSC_SUCCESS);
7017 }
7018 
7019 /* Copy selected n entries with indices in idx[] of A to v[].
7020    If idx is NULL, copy the whole data array of A to v[]
7021  */
7022 static PetscErrorCode MatSeqAIJCopySubArray(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
7023 {
7024   PetscErrorCode (*f)(Mat, PetscInt, const PetscInt[], PetscScalar[]);
7025 
7026   PetscFunctionBegin;
7027   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", &f));
7028   if (f) {
7029     PetscCall((*f)(A, n, idx, v));
7030   } else {
7031     const PetscScalar *vv;
7032 
7033     PetscCall(MatSeqAIJGetArrayRead(A, &vv));
7034     if (n && idx) {
7035       PetscScalar    *w  = v;
7036       const PetscInt *oi = idx;
7037       PetscInt        j;
7038 
7039       for (j = 0; j < n; j++) *w++ = vv[*oi++];
7040     } else {
7041       PetscCall(PetscArraycpy(v, vv, n));
7042     }
7043     PetscCall(MatSeqAIJRestoreArrayRead(A, &vv));
7044   }
7045   PetscFunctionReturn(PETSC_SUCCESS);
7046 }
7047 
7048 static PetscErrorCode MatProductNumeric_MPIAIJBACKEND(Mat C)
7049 {
7050   MatMatMPIAIJBACKEND *mmdata;
7051   PetscInt             i, n_d, n_o;
7052 
7053   PetscFunctionBegin;
7054   MatCheckProduct(C, 1);
7055   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
7056   mmdata = (MatMatMPIAIJBACKEND *)C->product->data;
7057   if (!mmdata->reusesym) { /* update temporary matrices */
7058     if (mmdata->P_oth) PetscCall(MatGetBrowsOfAoCols_MPIAIJ(C->product->A, C->product->B, MAT_REUSE_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7059     if (mmdata->Bloc) PetscCall(MatMPIAIJGetLocalMatMerge(C->product->B, MAT_REUSE_MATRIX, NULL, &mmdata->Bloc));
7060   }
7061   mmdata->reusesym = PETSC_FALSE;
7062 
7063   for (i = 0; i < mmdata->cp; i++) {
7064     PetscCheck(mmdata->mp[i]->ops->productnumeric, PetscObjectComm((PetscObject)mmdata->mp[i]), PETSC_ERR_PLIB, "Missing numeric op for %s", MatProductTypes[mmdata->mp[i]->product->type]);
7065     PetscCall((*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]));
7066   }
7067   for (i = 0, n_d = 0, n_o = 0; i < mmdata->cp; i++) {
7068     PetscInt noff;
7069 
7070     PetscCall(PetscIntCast(mmdata->off[i + 1] - mmdata->off[i], &noff));
7071     if (mmdata->mptmp[i]) continue;
7072     if (noff) {
7073       PetscInt nown;
7074 
7075       PetscCall(PetscIntCast(mmdata->own[i + 1] - mmdata->own[i], &nown));
7076       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], noff, mmdata->off[i], mmdata->coo_w + n_o));
7077       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], nown, mmdata->own[i], mmdata->coo_v + n_d));
7078       n_o += noff;
7079       n_d += nown;
7080     } else {
7081       Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mmdata->mp[i]->data;
7082 
7083       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], mm->nz, NULL, mmdata->coo_v + n_d));
7084       n_d += mm->nz;
7085     }
7086   }
7087   if (mmdata->hasoffproc) { /* offprocess insertion */
7088     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7089     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7090   }
7091   PetscCall(MatSetValuesCOO(C, mmdata->coo_v, INSERT_VALUES));
7092   PetscFunctionReturn(PETSC_SUCCESS);
7093 }
7094 
7095 /* Support for Pt * A, A * P, or Pt * A * P */
7096 #define MAX_NUMBER_INTERMEDIATE 4
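/* Informal sketch of the decomposition used below, per rank, with A_d/A_o the diagonal/off-diagonal blocks of A,
   P_loc the merged local rows of P, and P_oth the rows of P matching A_o's columns:
     A*P    : C(local rows)  = A_d * P_loc  +  A_o * P_oth
     Pt*A   : C             += P_d^T * A_loc (local rows)  +  P_o^T * A_loc (rows scattered via p->garray)
     Pt*A*P : C             += P_loc^T * A_d * P_loc  +  P_loc^T * (A_o * P_oth)
   Each term is computed as an intermediate sequential product stored in mp[], and the results are inserted
   into C with MatSetValuesCOO(). */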
7097 PetscErrorCode MatProductSymbolic_MPIAIJBACKEND(Mat C)
7098 {
7099   Mat_Product           *product = C->product;
7100   Mat                    A, P, mp[MAX_NUMBER_INTERMEDIATE]; /* A, P and a series of intermediate matrices */
7101   Mat_MPIAIJ            *a, *p;
7102   MatMatMPIAIJBACKEND   *mmdata;
7103   ISLocalToGlobalMapping P_oth_l2g = NULL;
7104   IS                     glob      = NULL;
7105   const char            *prefix;
7106   char                   pprefix[256];
7107   const PetscInt        *globidx, *P_oth_idx;
7108   PetscInt               i, j, cp, m, n, M, N, *coo_i, *coo_j;
7109   PetscCount             ncoo, ncoo_d, ncoo_o, ncoo_oown;
7110   PetscInt               cmapt[MAX_NUMBER_INTERMEDIATE], rmapt[MAX_NUMBER_INTERMEDIATE]; /* col/row map type for each Mat in mp[]. */
7111                                                                                          /* type-0: consecutive, start from 0; type-1: consecutive with */
7112                                                                                          /* a base offset; type-2: sparse with a local to global map table */
7113   const PetscInt *cmapa[MAX_NUMBER_INTERMEDIATE], *rmapa[MAX_NUMBER_INTERMEDIATE];       /* col/row local to global map array (table) for type-2 map type */
7114 
7115   MatProductType ptype;
7116   PetscBool      mptmp[MAX_NUMBER_INTERMEDIATE], hasoffproc = PETSC_FALSE, iscuda, iship, iskokk;
7117   PetscMPIInt    size;
7118 
7119   PetscFunctionBegin;
7120   MatCheckProduct(C, 1);
7121   PetscCheck(!product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
7122   ptype = product->type;
7123   if (product->A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
7124     ptype                                          = MATPRODUCT_AB;
7125     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
7126   }
7127   switch (ptype) {
7128   case MATPRODUCT_AB:
7129     A          = product->A;
7130     P          = product->B;
7131     m          = A->rmap->n;
7132     n          = P->cmap->n;
7133     M          = A->rmap->N;
7134     N          = P->cmap->N;
7135     hasoffproc = PETSC_FALSE; /* will not scatter mat product values to other processes */
7136     break;
7137   case MATPRODUCT_AtB:
7138     P          = product->A;
7139     A          = product->B;
7140     m          = P->cmap->n;
7141     n          = A->cmap->n;
7142     M          = P->cmap->N;
7143     N          = A->cmap->N;
7144     hasoffproc = PETSC_TRUE;
7145     break;
7146   case MATPRODUCT_PtAP:
7147     A          = product->A;
7148     P          = product->B;
7149     m          = P->cmap->n;
7150     n          = P->cmap->n;
7151     M          = P->cmap->N;
7152     N          = P->cmap->N;
7153     hasoffproc = PETSC_TRUE;
7154     break;
7155   default:
7156     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7157   }
7158   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)C), &size));
7159   if (size == 1) hasoffproc = PETSC_FALSE;
7160 
7161   /* defaults */
7162   for (i = 0; i < MAX_NUMBER_INTERMEDIATE; i++) {
7163     mp[i]    = NULL;
7164     mptmp[i] = PETSC_FALSE;
7165     rmapt[i] = -1;
7166     cmapt[i] = -1;
7167     rmapa[i] = NULL;
7168     cmapa[i] = NULL;
7169   }
7170 
7171   /* customization */
7172   PetscCall(PetscNew(&mmdata));
7173   mmdata->reusesym = product->api_user;
7174   if (ptype == MATPRODUCT_AB) {
7175     if (product->api_user) {
7176       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
7177       PetscCall(PetscOptionsBool("-matmatmult_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7178       PetscCall(PetscOptionsBool("-matmatmult_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7179       PetscOptionsEnd();
7180     } else {
7181       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
7182       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7183       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7184       PetscOptionsEnd();
7185     }
7186   } else if (ptype == MATPRODUCT_PtAP) {
7187     if (product->api_user) {
7188       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
7189       PetscCall(PetscOptionsBool("-matptap_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7190       PetscOptionsEnd();
7191     } else {
7192       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
7193       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7194       PetscOptionsEnd();
7195     }
7196   }
7197   a = (Mat_MPIAIJ *)A->data;
7198   p = (Mat_MPIAIJ *)P->data;
7199   PetscCall(MatSetSizes(C, m, n, M, N));
7200   PetscCall(PetscLayoutSetUp(C->rmap));
7201   PetscCall(PetscLayoutSetUp(C->cmap));
7202   PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
7203   PetscCall(MatGetOptionsPrefix(C, &prefix));
7204 
7205   cp = 0;
7206   switch (ptype) {
7207   case MATPRODUCT_AB: /* A * P */
7208     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7209 
7210     /* A_diag * P_local (merged or not) */
7211     if (mmdata->abmerge) { /* P's diagonal and off-diag blocks are merged to one matrix, then multiplied by A_diag */
7212       /* P is product->B */
7213       PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7214       PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7215       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7216       PetscCall(MatProductSetFill(mp[cp], product->fill));
7217       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7218       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7219       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7220       mp[cp]->product->api_user = product->api_user;
7221       PetscCall(MatProductSetFromOptions(mp[cp]));
7222       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7223       PetscCall(ISGetIndices(glob, &globidx));
7224       rmapt[cp] = 1;
7225       cmapt[cp] = 2;
7226       cmapa[cp] = globidx;
7227       mptmp[cp] = PETSC_FALSE;
7228       cp++;
7229     } else { /* A_diag * P_diag and A_diag * P_off */
7230       PetscCall(MatProductCreate(a->A, p->A, NULL, &mp[cp]));
7231       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7232       PetscCall(MatProductSetFill(mp[cp], product->fill));
7233       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7234       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7235       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7236       mp[cp]->product->api_user = product->api_user;
7237       PetscCall(MatProductSetFromOptions(mp[cp]));
7238       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7239       rmapt[cp] = 1;
7240       cmapt[cp] = 1;
7241       mptmp[cp] = PETSC_FALSE;
7242       cp++;
7243       PetscCall(MatProductCreate(a->A, p->B, NULL, &mp[cp]));
7244       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7245       PetscCall(MatProductSetFill(mp[cp], product->fill));
7246       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7247       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7248       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7249       mp[cp]->product->api_user = product->api_user;
7250       PetscCall(MatProductSetFromOptions(mp[cp]));
7251       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7252       rmapt[cp] = 1;
7253       cmapt[cp] = 2;
7254       cmapa[cp] = p->garray;
7255       mptmp[cp] = PETSC_FALSE;
7256       cp++;
7257     }
7258 
7259     /* A_off * P_other */
7260     if (mmdata->P_oth) {
7261       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g)); /* make P_oth use local col ids */
7262       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7263       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7264       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7265       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7266       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7267       PetscCall(MatProductSetFill(mp[cp], product->fill));
7268       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7269       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7270       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7271       mp[cp]->product->api_user = product->api_user;
7272       PetscCall(MatProductSetFromOptions(mp[cp]));
7273       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7274       rmapt[cp] = 1;
7275       cmapt[cp] = 2;
7276       cmapa[cp] = P_oth_idx;
7277       mptmp[cp] = PETSC_FALSE;
7278       cp++;
7279     }
7280     break;
7281 
7282   case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
7283     /* A is product->B */
7284     PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7285     if (A == P) { /* when A==P, we can take advantage of the already merged mmdata->Bloc */
7286       PetscCall(MatProductCreate(mmdata->Bloc, mmdata->Bloc, NULL, &mp[cp]));
7287       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7288       PetscCall(MatProductSetFill(mp[cp], product->fill));
7289       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7290       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7291       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7292       mp[cp]->product->api_user = product->api_user;
7293       PetscCall(MatProductSetFromOptions(mp[cp]));
7294       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7295       PetscCall(ISGetIndices(glob, &globidx));
7296       rmapt[cp] = 2;
7297       rmapa[cp] = globidx;
7298       cmapt[cp] = 2;
7299       cmapa[cp] = globidx;
7300       mptmp[cp] = PETSC_FALSE;
7301       cp++;
7302     } else {
7303       PetscCall(MatProductCreate(p->A, mmdata->Bloc, NULL, &mp[cp]));
7304       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7305       PetscCall(MatProductSetFill(mp[cp], product->fill));
7306       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7307       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7308       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7309       mp[cp]->product->api_user = product->api_user;
7310       PetscCall(MatProductSetFromOptions(mp[cp]));
7311       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7312       PetscCall(ISGetIndices(glob, &globidx));
7313       rmapt[cp] = 1;
7314       cmapt[cp] = 2;
7315       cmapa[cp] = globidx;
7316       mptmp[cp] = PETSC_FALSE;
7317       cp++;
7318       PetscCall(MatProductCreate(p->B, mmdata->Bloc, NULL, &mp[cp]));
7319       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7320       PetscCall(MatProductSetFill(mp[cp], product->fill));
7321       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7322       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7323       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7324       mp[cp]->product->api_user = product->api_user;
7325       PetscCall(MatProductSetFromOptions(mp[cp]));
7326       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7327       rmapt[cp] = 2;
7328       rmapa[cp] = p->garray;
7329       cmapt[cp] = 2;
7330       cmapa[cp] = globidx;
7331       mptmp[cp] = PETSC_FALSE;
7332       cp++;
7333     }
7334     break;
7335   case MATPRODUCT_PtAP:
7336     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7337     /* P is product->B */
7338     PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7339     PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7340     PetscCall(MatProductSetType(mp[cp], MATPRODUCT_PtAP));
7341     PetscCall(MatProductSetFill(mp[cp], product->fill));
7342     PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7343     PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7344     PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7345     mp[cp]->product->api_user = product->api_user;
7346     PetscCall(MatProductSetFromOptions(mp[cp]));
7347     PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7348     PetscCall(ISGetIndices(glob, &globidx));
7349     rmapt[cp] = 2;
7350     rmapa[cp] = globidx;
7351     cmapt[cp] = 2;
7352     cmapa[cp] = globidx;
7353     mptmp[cp] = PETSC_FALSE;
7354     cp++;
7355     if (mmdata->P_oth) {
7356       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g));
7357       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7358       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7359       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7360       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7361       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7362       PetscCall(MatProductSetFill(mp[cp], product->fill));
7363       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7364       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7365       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7366       mp[cp]->product->api_user = product->api_user;
7367       PetscCall(MatProductSetFromOptions(mp[cp]));
7368       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7369       mptmp[cp] = PETSC_TRUE;
7370       cp++;
7371       PetscCall(MatProductCreate(mmdata->Bloc, mp[1], NULL, &mp[cp]));
7372       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7373       PetscCall(MatProductSetFill(mp[cp], product->fill));
7374       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7375       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7376       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7377       mp[cp]->product->api_user = product->api_user;
7378       PetscCall(MatProductSetFromOptions(mp[cp]));
7379       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7380       rmapt[cp] = 2;
7381       rmapa[cp] = globidx;
7382       cmapt[cp] = 2;
7383       cmapa[cp] = P_oth_idx;
7384       mptmp[cp] = PETSC_FALSE;
7385       cp++;
7386     }
7387     break;
7388   default:
7389     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7390   }
7391   /* sanity check */
7392   if (size > 1)
7393     for (i = 0; i < cp; i++) PetscCheck(rmapt[i] != 2 || hasoffproc, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected offproc map type for product %" PetscInt_FMT, i);
7394 
7395   PetscCall(PetscMalloc2(cp, &mmdata->mp, cp, &mmdata->mptmp));
7396   for (i = 0; i < cp; i++) {
7397     mmdata->mp[i]    = mp[i];
7398     mmdata->mptmp[i] = mptmp[i];
7399   }
7400   mmdata->cp             = cp;
7401   C->product->data       = mmdata;
7402   C->product->destroy    = MatDestroy_MatMatMPIAIJBACKEND;
7403   C->ops->productnumeric = MatProductNumeric_MPIAIJBACKEND;
7404 
7405   /* memory type */
7406   mmdata->mtype = PETSC_MEMTYPE_HOST;
7407   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iscuda, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, ""));
7408   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iship, MATSEQAIJHIPSPARSE, MATMPIAIJHIPSPARSE, ""));
7409   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iskokk, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, ""));
7410   if (iscuda) mmdata->mtype = PETSC_MEMTYPE_CUDA;
7411   else if (iship) mmdata->mtype = PETSC_MEMTYPE_HIP;
7412   else if (iskokk) mmdata->mtype = PETSC_MEMTYPE_KOKKOS;
7413 
7414   /* prepare coo coordinates for values insertion */
7415 
7416   /* count total nonzeros of those intermediate seqaij Mats
7417     ncoo_d:    # of nonzeros of matrices that do not have offproc entries
    ncoo_o:    # of nonzeros (of matrices that might have offproc entries) that will be sent to and inserted on remote procs
7419     ncoo_oown: # of nonzeros (of matrices that might have offproc entries) that will be inserted locally
7420   */
7421   for (cp = 0, ncoo_d = 0, ncoo_o = 0, ncoo_oown = 0; cp < mmdata->cp; cp++) {
7422     Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mp[cp]->data;
7423     if (mptmp[cp]) continue;
    if (rmapt[cp] == 2 && hasoffproc) { /* the rows need to be scattered to all processes (might include self) */
7425       const PetscInt *rmap = rmapa[cp];
7426       const PetscInt  mr   = mp[cp]->rmap->n;
7427       const PetscInt  rs   = C->rmap->rstart;
7428       const PetscInt  re   = C->rmap->rend;
7429       const PetscInt *ii   = mm->i;
7430       for (i = 0; i < mr; i++) {
7431         const PetscInt gr = rmap[i];
7432         const PetscInt nz = ii[i + 1] - ii[i];
7433         if (gr < rs || gr >= re) ncoo_o += nz; /* this row is offproc */
7434         else ncoo_oown += nz;                  /* this row is local */
7435       }
7436     } else ncoo_d += mm->nz;
7437   }
7438 
7439   /*
7440     ncoo: total number of nonzeros (including those inserted by remote procs) belonging to this proc
7441 
    ncoo = ncoo_d + ncoo_oown + ncoo2, where ncoo2 is the number of nonzeros inserted to me by other procs.
7443 
7444     off[0] points to a big index array, which is shared by off[1,2,...]. Similarly, for own[0].
7445 
    off[p]: points to the segment for matrix mp[p], storing the locations of nonzeros that mp[p] will insert on other processes
    own[p]: points to the segment for matrix mp[p], storing the locations of nonzeros that mp[p] will insert locally
    so off[p+1]-off[p] is the number of nonzeros that mp[p] will send to others.
7449 
7450     coo_i/j/v[]: [ncoo] row/col/val of nonzeros belonging to this proc.
7451     Ex. coo_i[]: the beginning part (of size ncoo_d + ncoo_oown) stores i of local nonzeros, and the remaining part stores i of nonzeros I will receive.
7452   */
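  /* Illustrative example (hypothetical counts): with mmdata->cp = 2 non-temporary products, where mp[0]
     has 5 nonzeros in offproc rows and mp[1] has 3, off[0] points to an index array of length 8,
     off[1] = off[0] + 5 and off[2] = off[1] + 3; own[] partitions the locally owned positions in the
     same way. */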
7453   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->off)); /* +1 to make a csr-like data structure */
7454   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->own));
7455 
7456   /* gather (i,j) of nonzeros inserted by remote procs */
7457   if (hasoffproc) {
7458     PetscSF  msf;
7459     PetscInt ncoo2, *coo_i2, *coo_j2;
7460 
7461     PetscCall(PetscMalloc1(ncoo_o, &mmdata->off[0]));
7462     PetscCall(PetscMalloc1(ncoo_oown, &mmdata->own[0]));
7463     PetscCall(PetscMalloc2(ncoo_o, &coo_i, ncoo_o, &coo_j)); /* to collect (i,j) of entries to be sent to others */
7464 
7465     for (cp = 0, ncoo_o = 0; cp < mmdata->cp; cp++) {
7466       Mat_SeqAIJ *mm     = (Mat_SeqAIJ *)mp[cp]->data;
7467       PetscInt   *idxoff = mmdata->off[cp];
7468       PetscInt   *idxown = mmdata->own[cp];
7469       if (!mptmp[cp] && rmapt[cp] == 2) { /* row map is sparse */
7470         const PetscInt *rmap = rmapa[cp];
7471         const PetscInt *cmap = cmapa[cp];
7472         const PetscInt *ii   = mm->i;
7473         PetscInt       *coi  = coo_i + ncoo_o;
7474         PetscInt       *coj  = coo_j + ncoo_o;
7475         const PetscInt  mr   = mp[cp]->rmap->n;
7476         const PetscInt  rs   = C->rmap->rstart;
7477         const PetscInt  re   = C->rmap->rend;
7478         const PetscInt  cs   = C->cmap->rstart;
7479         for (i = 0; i < mr; i++) {
7480           const PetscInt *jj = mm->j + ii[i];
7481           const PetscInt  gr = rmap[i];
7482           const PetscInt  nz = ii[i + 1] - ii[i];
7483           if (gr < rs || gr >= re) { /* this is an offproc row */
7484             for (j = ii[i]; j < ii[i + 1]; j++) {
7485               *coi++    = gr;
7486               *idxoff++ = j;
7487             }
7488             if (!cmapt[cp]) { /* already global */
7489               for (j = 0; j < nz; j++) *coj++ = jj[j];
7490             } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7491               for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
            } else { /* type-2, local to global for sparse columns */
7493               for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7494             }
7495             ncoo_o += nz;
7496           } else { /* this is a local row */
7497             for (j = ii[i]; j < ii[i + 1]; j++) *idxown++ = j;
7498           }
7499         }
7500       }
7501       mmdata->off[cp + 1] = idxoff;
7502       mmdata->own[cp + 1] = idxown;
7503     }
7504 
7505     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7506     PetscInt incoo_o;
7507     PetscCall(PetscIntCast(ncoo_o, &incoo_o));
7508     PetscCall(PetscSFSetGraphLayout(mmdata->sf, C->rmap, incoo_o /*nleaves*/, NULL /*ilocal*/, PETSC_OWN_POINTER, coo_i));
7509     PetscCall(PetscSFGetMultiSF(mmdata->sf, &msf));
7510     PetscCall(PetscSFGetGraph(msf, &ncoo2 /*nroots*/, NULL, NULL, NULL));
7511     ncoo = ncoo_d + ncoo_oown + ncoo2;
7512     PetscCall(PetscMalloc2(ncoo, &coo_i2, ncoo, &coo_j2));
7513     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown)); /* put (i,j) of remote nonzeros at back */
7514     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown));
7515     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7516     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7517     PetscCall(PetscFree2(coo_i, coo_j));
7518     /* allocate MPI send buffer to collect nonzero values to be sent to remote procs */
7519     PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo_o * sizeof(PetscScalar), (void **)&mmdata->coo_w));
7520     coo_i = coo_i2;
7521     coo_j = coo_j2;
7522   } else { /* no offproc values insertion */
7523     ncoo = ncoo_d;
7524     PetscCall(PetscMalloc2(ncoo, &coo_i, ncoo, &coo_j));
7525 
7526     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7527     PetscCall(PetscSFSetGraph(mmdata->sf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
7528     PetscCall(PetscSFSetUp(mmdata->sf));
7529   }
7530   mmdata->hasoffproc = hasoffproc;
7531 
7532   /* gather (i,j) of nonzeros inserted locally */
7533   for (cp = 0, ncoo_d = 0; cp < mmdata->cp; cp++) {
7534     Mat_SeqAIJ     *mm   = (Mat_SeqAIJ *)mp[cp]->data;
7535     PetscInt       *coi  = coo_i + ncoo_d;
7536     PetscInt       *coj  = coo_j + ncoo_d;
7537     const PetscInt *jj   = mm->j;
7538     const PetscInt *ii   = mm->i;
7539     const PetscInt *cmap = cmapa[cp];
7540     const PetscInt *rmap = rmapa[cp];
7541     const PetscInt  mr   = mp[cp]->rmap->n;
7542     const PetscInt  rs   = C->rmap->rstart;
7543     const PetscInt  re   = C->rmap->rend;
7544     const PetscInt  cs   = C->cmap->rstart;
7545 
7546     if (mptmp[cp]) continue;
7547     if (rmapt[cp] == 1) { /* consecutive rows */
7548       /* fill coo_i */
7549       for (i = 0; i < mr; i++) {
7550         const PetscInt gr = i + rs;
7551         for (j = ii[i]; j < ii[i + 1]; j++) coi[j] = gr;
7552       }
7553       /* fill coo_j */
7554       if (!cmapt[cp]) { /* type-0, already global */
7555         PetscCall(PetscArraycpy(coj, jj, mm->nz));
7556       } else if (cmapt[cp] == 1) {                        /* type-1, local to global for consecutive columns of C */
7557         for (j = 0; j < mm->nz; j++) coj[j] = jj[j] + cs; /* lid + col start */
7558       } else {                                            /* type-2, local to global for sparse columns */
7559         for (j = 0; j < mm->nz; j++) coj[j] = cmap[jj[j]];
7560       }
7561       ncoo_d += mm->nz;
7562     } else if (rmapt[cp] == 2) { /* sparse rows */
7563       for (i = 0; i < mr; i++) {
7564         const PetscInt *jj = mm->j + ii[i];
7565         const PetscInt  gr = rmap[i];
7566         const PetscInt  nz = ii[i + 1] - ii[i];
7567         if (gr >= rs && gr < re) { /* local rows */
7568           for (j = ii[i]; j < ii[i + 1]; j++) *coi++ = gr;
7569           if (!cmapt[cp]) { /* type-0, already global */
7570             for (j = 0; j < nz; j++) *coj++ = jj[j];
7571           } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7572             for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7573           } else { /* type-2, local to global for sparse columns */
7574             for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7575           }
7576           ncoo_d += nz;
7577         }
7578       }
7579     }
7580   }
7581   if (glob) PetscCall(ISRestoreIndices(glob, &globidx));
7582   PetscCall(ISDestroy(&glob));
7583   if (P_oth_l2g) PetscCall(ISLocalToGlobalMappingRestoreIndices(P_oth_l2g, &P_oth_idx));
7584   PetscCall(ISLocalToGlobalMappingDestroy(&P_oth_l2g));
7585   /* allocate an array to store all nonzeros (inserted locally or remotely) belonging to this proc */
7586   PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo * sizeof(PetscScalar), (void **)&mmdata->coo_v));
7587 
7588   /* set block sizes */
7589   A = product->A;
7590   P = product->B;
7591   switch (ptype) {
7592   case MATPRODUCT_PtAP:
7593     PetscCall(MatSetBlockSizes(C, P->cmap->bs, P->cmap->bs));
7594     break;
7595   case MATPRODUCT_RARt:
7596     PetscCall(MatSetBlockSizes(C, P->rmap->bs, P->rmap->bs));
7597     break;
7598   case MATPRODUCT_ABC:
7599     PetscCall(MatSetBlockSizesFromMats(C, A, product->C));
7600     break;
7601   case MATPRODUCT_AB:
7602     PetscCall(MatSetBlockSizesFromMats(C, A, P));
7603     break;
7604   case MATPRODUCT_AtB:
7605     PetscCall(MatSetBlockSizes(C, A->cmap->bs, P->cmap->bs));
7606     break;
7607   case MATPRODUCT_ABt:
7608     PetscCall(MatSetBlockSizes(C, A->rmap->bs, P->rmap->bs));
7609     break;
7610   default:
7611     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for ProductType %s", MatProductTypes[ptype]);
7612   }
7613 
7614   /* preallocate with COO data */
7615   PetscCall(MatSetPreallocationCOO(C, ncoo, coo_i, coo_j));
7616   PetscCall(PetscFree2(coo_i, coo_j));
7617   PetscFunctionReturn(PETSC_SUCCESS);
7618 }
7619 
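/*
   Illustrative sketch (not part of the build; A and B are placeholder parallel Mats): how a user-level
   product request reaches the symbolic routine above and the numeric routine it installs.

     Mat C;
     PetscCall(MatProductCreate(A, B, NULL, &C));
     PetscCall(MatProductSetType(C, MATPRODUCT_AB));
     PetscCall(MatProductSetFromOptions(C)); // for backend types this may select MatProductSymbolic_MPIAIJBACKEND
     PetscCall(MatProductSymbolic(C));       // builds the COO preallocation assembled above
     PetscCall(MatProductNumeric(C));        // dispatches to MatProductNumeric_MPIAIJBACKEND
     PetscCall(MatDestroy(&C));
*/
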
7620 PetscErrorCode MatProductSetFromOptions_MPIAIJBACKEND(Mat mat)
7621 {
7622   Mat_Product *product = mat->product;
7623 #if defined(PETSC_HAVE_DEVICE)
7624   PetscBool match  = PETSC_FALSE;
7625   PetscBool usecpu = PETSC_FALSE;
7626 #else
7627   PetscBool match = PETSC_TRUE;
7628 #endif
7629 
7630   PetscFunctionBegin;
7631   MatCheckProduct(mat, 1);
7632 #if defined(PETSC_HAVE_DEVICE)
7633   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match));
  if (match) { /* we can always fall back to the CPU if requested */
7635     switch (product->type) {
7636     case MATPRODUCT_AB:
7637       if (product->api_user) {
7638         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
7639         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7640         PetscOptionsEnd();
7641       } else {
7642         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
7643         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7644         PetscOptionsEnd();
7645       }
7646       break;
7647     case MATPRODUCT_AtB:
7648       if (product->api_user) {
7649         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
7650         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7651         PetscOptionsEnd();
7652       } else {
7653         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
7654         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7655         PetscOptionsEnd();
7656       }
7657       break;
7658     case MATPRODUCT_PtAP:
7659       if (product->api_user) {
7660         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
7661         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7662         PetscOptionsEnd();
7663       } else {
7664         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
7665         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7666         PetscOptionsEnd();
7667       }
7668       break;
7669     default:
7670       break;
7671     }
7672     match = (PetscBool)!usecpu;
7673   }
7674 #endif
7675   if (match) {
7676     switch (product->type) {
7677     case MATPRODUCT_AB:
7678     case MATPRODUCT_AtB:
7679     case MATPRODUCT_PtAP:
7680       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJBACKEND;
7681       break;
7682     default:
7683       break;
7684     }
7685   }
  /* fall back to the MPIAIJ ops */
7687   if (!mat->ops->productsymbolic) PetscCall(MatProductSetFromOptions_MPIAIJ(mat));
7688   PetscFunctionReturn(PETSC_SUCCESS);
7689 }
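/*
   Illustrative usage (assumption: an application run with device matrices): the options registered
   above let the user force the CPU fallback at run time, e.g.

     ./app -matmatmult_backend_cpu                (MatMatMult API path)
     ./app -mat_product_algorithm_backend_cpu     (MatProduct interface path)

   in which case the MPIAIJ implementation selected by MatProductSetFromOptions_MPIAIJ is used instead
   of MatProductSymbolic_MPIAIJBACKEND.
*/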
7690 
7691 /*
7692    Produces a set of block column indices of the matrix row, one for each block represented in the original row
7693 
7694    n - the number of block indices in cc[]
7695    cc - the block indices (must be large enough to contain the indices)
7696 */
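/* Example (illustrative): with bs = 2 and a row whose column indices are {0, 1, 4, 5, 9},
   the collapsed block indices are cc[] = {0, 2, 4} and *n is 3. */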
7697 static inline PetscErrorCode MatCollapseRow(Mat Amat, PetscInt row, PetscInt bs, PetscInt *n, PetscInt *cc)
7698 {
7699   PetscInt        cnt = -1, nidx, j;
7700   const PetscInt *idx;
7701 
7702   PetscFunctionBegin;
7703   PetscCall(MatGetRow(Amat, row, &nidx, &idx, NULL));
7704   if (nidx) {
7705     cnt     = 0;
7706     cc[cnt] = idx[0] / bs;
7707     for (j = 1; j < nidx; j++) {
7708       if (cc[cnt] < idx[j] / bs) cc[++cnt] = idx[j] / bs;
7709     }
7710   }
7711   PetscCall(MatRestoreRow(Amat, row, &nidx, &idx, NULL));
7712   *n = cnt + 1;
7713   PetscFunctionReturn(PETSC_SUCCESS);
7714 }
7715 
7716 /*
7717     Produces a set of block column indices of the matrix block row, one for each block represented in the original set of rows
7718 
7719     ncollapsed - the number of block indices
7720     collapsed - the block indices (must be large enough to contain the indices)
7721 */
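/* Example (illustrative): with bs = 2, if the two scalar rows of a block row collapse to the block
   indices {0, 2} and {1, 2}, the merged result is {0, 1, 2} and *ncollapsed is 3. */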
7722 static inline PetscErrorCode MatCollapseRows(Mat Amat, PetscInt start, PetscInt bs, PetscInt *w0, PetscInt *w1, PetscInt *w2, PetscInt *ncollapsed, PetscInt **collapsed)
7723 {
7724   PetscInt i, nprev, *cprev = w0, ncur = 0, *ccur = w1, *merged = w2, *cprevtmp;
7725 
7726   PetscFunctionBegin;
7727   PetscCall(MatCollapseRow(Amat, start, bs, &nprev, cprev));
7728   for (i = start + 1; i < start + bs; i++) {
7729     PetscCall(MatCollapseRow(Amat, i, bs, &ncur, ccur));
7730     PetscCall(PetscMergeIntArray(nprev, cprev, ncur, ccur, &nprev, &merged));
7731     cprevtmp = cprev;
7732     cprev    = merged;
7733     merged   = cprevtmp;
7734   }
7735   *ncollapsed = nprev;
7736   if (collapsed) *collapsed = cprev;
7737   PetscFunctionReturn(PETSC_SUCCESS);
7738 }
7739 
7740 /*
7741  MatCreateGraph_Simple_AIJ - create simple scalar matrix (graph) from potentially blocked matrix
7742 
 Input Parameters:
+ Amat - matrix
. symmetrize - make the result symmetric
. scale - scale with diagonal
. filter - filter tolerance; if nonnegative, graph entries below this tolerance are filtered out
. index_size - size of index[]
- index - dofs within each block used to compute the graph values (index_size == 0 means use all dofs)

 Output Parameter:
. a_Gmat - output scalar graph (values >= 0)
7750 
7751 */
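/*
   Illustrative sketch (not part of the build): one way this routine might be invoked, e.g. from a
   coarsening/aggregation context, to build a symmetrized, diagonally scaled scalar graph from a
   blocked matrix A, with no filtering and no restriction to particular dofs within each block.

     Mat G;
     PetscCall(MatCreateGraph_Simple_AIJ(A, PETSC_TRUE, PETSC_TRUE, -1.0, 0, NULL, &G));
     // ... use G ...
     PetscCall(MatDestroy(&G));
*/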
7752 PETSC_INTERN PetscErrorCode MatCreateGraph_Simple_AIJ(Mat Amat, PetscBool symmetrize, PetscBool scale, PetscReal filter, PetscInt index_size, PetscInt index[], Mat *a_Gmat)
7753 {
7754   PetscInt  Istart, Iend, Ii, jj, kk, ncols, nloc, NN, MM, bs;
7755   MPI_Comm  comm;
7756   Mat       Gmat;
7757   PetscBool ismpiaij, isseqaij;
7758   Mat       a, b, c;
7759   MatType   jtype;
7760 
7761   PetscFunctionBegin;
7762   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
7763   PetscCall(MatGetOwnershipRange(Amat, &Istart, &Iend));
7764   PetscCall(MatGetSize(Amat, &MM, &NN));
7765   PetscCall(MatGetBlockSize(Amat, &bs));
7766   nloc = (Iend - Istart) / bs;
7767 
7768   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATSEQAIJ, &isseqaij));
7769   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATMPIAIJ, &ismpiaij));
7770   PetscCheck(isseqaij || ismpiaij, comm, PETSC_ERR_USER, "Require (MPI)AIJ matrix type");
7771 
7772   /* TODO GPU: these calls are potentially expensive if matrices are large and we want to use the GPU */
  /* One solution would be to provide a new API, MatAIJGetCollapsedAIJ, with each class providing a fast
     implementation */
7775   if (bs > 1) {
7776     PetscCall(MatGetType(Amat, &jtype));
7777     PetscCall(MatCreate(comm, &Gmat));
7778     PetscCall(MatSetType(Gmat, jtype));
7779     PetscCall(MatSetSizes(Gmat, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE));
7780     PetscCall(MatSetBlockSizes(Gmat, 1, 1));
7781     if (isseqaij || ((Mat_MPIAIJ *)Amat->data)->garray) {
7782       PetscInt  *d_nnz, *o_nnz;
7783       MatScalar *aa, val, *AA;
7784       PetscInt  *aj, *ai, *AJ, nc, nmax = 0;
7785 
7786       if (isseqaij) {
7787         a = Amat;
7788         b = NULL;
7789       } else {
7790         Mat_MPIAIJ *d = (Mat_MPIAIJ *)Amat->data;
7791         a             = d->A;
7792         b             = d->B;
7793       }
7794       PetscCall(PetscInfo(Amat, "New bs>1 Graph. nloc=%" PetscInt_FMT "\n", nloc));
7795       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7796       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
7797         PetscInt       *nnz = (c == a) ? d_nnz : o_nnz;
7798         const PetscInt *cols1, *cols2;
7799 
7800         for (PetscInt brow = 0, nc1, nc2, ok = 1; brow < nloc * bs; brow += bs) { // block rows
7801           PetscCall(MatGetRow(c, brow, &nc2, &cols2, NULL));
7802           nnz[brow / bs] = nc2 / bs;
7803           if (nc2 % bs) ok = 0;
7804           if (nnz[brow / bs] > nmax) nmax = nnz[brow / bs];
7805           for (PetscInt ii = 1; ii < bs; ii++) { // check for non-dense blocks
7806             PetscCall(MatGetRow(c, brow + ii, &nc1, &cols1, NULL));
7807             if (nc1 != nc2) ok = 0;
7808             else {
7809               for (PetscInt jj = 0; jj < nc1 && ok == 1; jj++) {
7810                 if (cols1[jj] != cols2[jj]) ok = 0;
7811                 if (cols1[jj] % bs != jj % bs) ok = 0;
7812               }
7813             }
7814             PetscCall(MatRestoreRow(c, brow + ii, &nc1, &cols1, NULL));
7815           }
7816           PetscCall(MatRestoreRow(c, brow, &nc2, &cols2, NULL));
7817           if (!ok) {
7818             PetscCall(PetscFree2(d_nnz, o_nnz));
7819             PetscCall(PetscInfo(Amat, "Found sparse blocks - revert to slow method\n"));
7820             goto old_bs;
7821           }
7822         }
7823       }
7824       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7825       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7826       PetscCall(PetscFree2(d_nnz, o_nnz));
7827       PetscCall(PetscMalloc2(nmax, &AA, nmax, &AJ));
7828       // diag
7829       for (PetscInt brow = 0, n, grow; brow < nloc * bs; brow += bs) { // block rows
7830         Mat_SeqAIJ *aseq = (Mat_SeqAIJ *)a->data;
7831 
7832         ai = aseq->i;
7833         n  = ai[brow + 1] - ai[brow];
7834         aj = aseq->j + ai[brow];
7835         for (PetscInt k = 0; k < n; k += bs) {   // block columns
7836           AJ[k / bs] = aj[k] / bs + Istart / bs; // diag starts at (Istart,Istart)
7837           val        = 0;
7838           if (index_size == 0) {
7839             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7840               aa = aseq->a + ai[brow + ii] + k;
7841               for (PetscInt jj = 0; jj < bs; jj++) {    // columns in block
7842                 val += PetscAbs(PetscRealPart(aa[jj])); // a sort of norm
7843               }
7844             }
7845           } else {                                            // use (index,index) value if provided
7846             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7847               PetscInt ii = index[iii];
7848               aa          = aseq->a + ai[brow + ii] + k;
7849               for (PetscInt jjj = 0; jjj < index_size; jjj++) { // columns in block
7850                 PetscInt jj = index[jjj];
7851                 val += PetscAbs(PetscRealPart(aa[jj]));
7852               }
7853             }
7854           }
7855           PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7856           AA[k / bs] = val;
7857         }
7858         grow = Istart / bs + brow / bs;
7859         PetscCall(MatSetValues(Gmat, 1, &grow, n / bs, AJ, AA, ADD_VALUES));
7860       }
7861       // off-diag
7862       if (ismpiaij) {
7863         Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)Amat->data;
7864         const PetscScalar *vals;
7865         const PetscInt    *cols, *garray = aij->garray;
7866 
7867         PetscCheck(garray, PETSC_COMM_SELF, PETSC_ERR_USER, "No garray ?");
7868         for (PetscInt brow = 0, grow; brow < nloc * bs; brow += bs) { // block rows
7869           PetscCall(MatGetRow(b, brow, &ncols, &cols, NULL));
7870           for (PetscInt k = 0, cidx = 0; k < ncols; k += bs, cidx++) {
7871             PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs >= nmax");
7872             AA[k / bs] = 0;
7873             AJ[cidx]   = garray[cols[k]] / bs;
7874           }
7875           nc = ncols / bs;
7876           PetscCall(MatRestoreRow(b, brow, &ncols, &cols, NULL));
7877           if (index_size == 0) {
7878             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7879               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7880               for (PetscInt k = 0; k < ncols; k += bs) {
7881                 for (PetscInt jj = 0; jj < bs; jj++) { // cols in block
7882                   PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7883                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7884                 }
7885               }
7886               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7887             }
7888           } else {                                            // use (index,index) value if provided
7889             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7890               PetscInt ii = index[iii];
7891               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7892               for (PetscInt k = 0; k < ncols; k += bs) {
7893                 for (PetscInt jjj = 0; jjj < index_size; jjj++) { // cols in block
7894                   PetscInt jj = index[jjj];
7895                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7896                 }
7897               }
7898               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7899             }
7900           }
7901           grow = Istart / bs + brow / bs;
7902           PetscCall(MatSetValues(Gmat, 1, &grow, nc, AJ, AA, ADD_VALUES));
7903         }
7904       }
7905       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7906       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7907       PetscCall(PetscFree2(AA, AJ));
7908     } else {
7909       const PetscScalar *vals;
7910       const PetscInt    *idx;
7911       PetscInt          *d_nnz, *o_nnz, *w0, *w1, *w2;
7912     old_bs:
7913       /*
7914        Determine the preallocation needed for the scalar matrix derived from the vector matrix.
7915        */
7916       PetscCall(PetscInfo(Amat, "OLD bs>1 CreateGraph\n"));
7917       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7918       if (isseqaij) {
7919         PetscInt max_d_nnz;
7920 
7921         /*
7922          Determine exact preallocation count for (sequential) scalar matrix
7923          */
7924         PetscCall(MatSeqAIJGetMaxRowNonzeros(Amat, &max_d_nnz));
7925         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7926         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7927         for (Ii = 0, jj = 0; Ii < Iend; Ii += bs, jj++) PetscCall(MatCollapseRows(Amat, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7928         PetscCall(PetscFree3(w0, w1, w2));
7929       } else if (ismpiaij) {
7930         Mat             Daij, Oaij;
7931         const PetscInt *garray;
7932         PetscInt        max_d_nnz;
7933 
7934         PetscCall(MatMPIAIJGetSeqAIJ(Amat, &Daij, &Oaij, &garray));
7935         /*
7936          Determine exact preallocation count for diagonal block portion of scalar matrix
7937          */
7938         PetscCall(MatSeqAIJGetMaxRowNonzeros(Daij, &max_d_nnz));
7939         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7940         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7941         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) PetscCall(MatCollapseRows(Daij, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7942         PetscCall(PetscFree3(w0, w1, w2));
7943         /*
7944          Over estimate (usually grossly over), preallocation count for off-diagonal portion of scalar matrix
7945          */
7946         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) {
7947           o_nnz[jj] = 0;
7948           for (kk = 0; kk < bs; kk++) { /* rows that get collapsed to a single row */
7949             PetscCall(MatGetRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7950             o_nnz[jj] += ncols;
7951             PetscCall(MatRestoreRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7952           }
7953           if (o_nnz[jj] > (NN / bs - nloc)) o_nnz[jj] = NN / bs - nloc;
7954         }
7955       } else SETERRQ(comm, PETSC_ERR_USER, "Require AIJ matrix type");
7956       /* get scalar copy (norms) of matrix */
7957       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7958       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7959       PetscCall(PetscFree2(d_nnz, o_nnz));
7960       for (Ii = Istart; Ii < Iend; Ii++) {
7961         PetscInt dest_row = Ii / bs;
7962 
7963         PetscCall(MatGetRow(Amat, Ii, &ncols, &idx, &vals));
7964         for (jj = 0; jj < ncols; jj++) {
7965           PetscInt    dest_col = idx[jj] / bs;
7966           PetscScalar sv       = PetscAbs(PetscRealPart(vals[jj]));
7967 
7968           PetscCall(MatSetValues(Gmat, 1, &dest_row, 1, &dest_col, &sv, ADD_VALUES));
7969         }
7970         PetscCall(MatRestoreRow(Amat, Ii, &ncols, &idx, &vals));
7971       }
7972       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7973       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7974     }
7975   } else {
7976     if (symmetrize || filter >= 0 || scale) PetscCall(MatDuplicate(Amat, MAT_COPY_VALUES, &Gmat));
7977     else {
7978       Gmat = Amat;
7979       PetscCall(PetscObjectReference((PetscObject)Gmat));
7980     }
7981     if (isseqaij) {
7982       a = Gmat;
7983       b = NULL;
7984     } else {
7985       Mat_MPIAIJ *d = (Mat_MPIAIJ *)Gmat->data;
7986       a             = d->A;
7987       b             = d->B;
7988     }
7989     if (filter >= 0 || scale) {
7990       /* take absolute value of each entry */
7991       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
7992         MatInfo      info;
7993         PetscScalar *avals;
7994 
7995         PetscCall(MatGetInfo(c, MAT_LOCAL, &info));
7996         PetscCall(MatSeqAIJGetArray(c, &avals));
7997         for (int jj = 0; jj < info.nz_used; jj++) avals[jj] = PetscAbsScalar(avals[jj]);
7998         PetscCall(MatSeqAIJRestoreArray(c, &avals));
7999       }
8000     }
8001   }
8002   if (symmetrize) {
8003     PetscBool isset, issym;
8004 
8005     PetscCall(MatIsSymmetricKnown(Amat, &isset, &issym));
8006     if (!isset || !issym) {
8007       Mat matTrans;
8008 
8009       PetscCall(MatTranspose(Gmat, MAT_INITIAL_MATRIX, &matTrans));
8010       PetscCall(MatAXPY(Gmat, 1.0, matTrans, Gmat->structurally_symmetric == PETSC_BOOL3_TRUE ? SAME_NONZERO_PATTERN : DIFFERENT_NONZERO_PATTERN));
8011       PetscCall(MatDestroy(&matTrans));
8012     }
8013     PetscCall(MatSetOption(Gmat, MAT_SYMMETRIC, PETSC_TRUE));
8014   } else if (Amat != Gmat) PetscCall(MatPropagateSymmetryOptions(Amat, Gmat));
8015   if (scale) {
8016     /* scale c for all diagonal values = 1 or -1 */
8017     Vec diag;
8018 
8019     PetscCall(MatCreateVecs(Gmat, &diag, NULL));
8020     PetscCall(MatGetDiagonal(Gmat, diag));
8021     PetscCall(VecReciprocal(diag));
8022     PetscCall(VecSqrtAbs(diag));
8023     PetscCall(MatDiagonalScale(Gmat, diag, diag));
8024     PetscCall(VecDestroy(&diag));
8025   }
8026   PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_graph_view"));
8027   if (filter >= 0) {
8028     PetscCall(MatFilter(Gmat, filter, PETSC_TRUE, PETSC_TRUE));
8029     PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_filter_graph_view"));
8030   }
8031   *a_Gmat = Gmat;
8032   PetscFunctionReturn(PETSC_SUCCESS);
8033 }
8034 
8035 PETSC_INTERN PetscErrorCode MatGetCurrentMemType_MPIAIJ(Mat A, PetscMemType *memtype)
8036 {
8037   Mat_MPIAIJ  *mpiaij = (Mat_MPIAIJ *)A->data;
8038   PetscMemType mD = PETSC_MEMTYPE_HOST, mO = PETSC_MEMTYPE_HOST;
8039 
8040   PetscFunctionBegin;
8041   if (mpiaij->A) PetscCall(MatGetCurrentMemType(mpiaij->A, &mD));
8042   if (mpiaij->B) PetscCall(MatGetCurrentMemType(mpiaij->B, &mO));
8043   *memtype = (mD == mO) ? mD : PETSC_MEMTYPE_HOST;
8044   PetscFunctionReturn(PETSC_SUCCESS);
8045 }
8046 
8047 /*
8048     Special version for direct calls from Fortran
8049 */
8050 
/* Change these macros so they can be used in a void function */
8052 /* Identical to PetscCallVoid, except it assigns to *_ierr */
8053 #undef PetscCall
8054 #define PetscCall(...) \
8055   do { \
8056     PetscErrorCode ierr_msv_mpiaij = __VA_ARGS__; \
8057     if (PetscUnlikely(ierr_msv_mpiaij)) { \
8058       *_ierr = PetscError(PETSC_COMM_SELF, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr_msv_mpiaij, PETSC_ERROR_REPEAT, " "); \
8059       return; \
8060     } \
8061   } while (0)
8062 
8063 #undef SETERRQ
8064 #define SETERRQ(comm, ierr, ...) \
8065   do { \
8066     *_ierr = PetscError(comm, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr, PETSC_ERROR_INITIAL, __VA_ARGS__); \
8067     return; \
8068   } while (0)
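/*
   Illustrative sketch (assumption): with the redefinitions above, a statement in the void Fortran stub
   below such as

     PetscCall(MatSeqAIJGetArray(A, &aa));

   does not return a PetscErrorCode on failure; instead the error code is stored through the *_ierr
   output argument and the void function returns, which is what the direct Fortran caller expects.
*/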
8069 
8070 #if defined(PETSC_HAVE_FORTRAN_CAPS)
8071   #define matsetvaluesmpiaij_ MATSETVALUESMPIAIJ
8072 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
8073   #define matsetvaluesmpiaij_ matsetvaluesmpiaij
8074 #else
8075 #endif
8076 PETSC_EXTERN void matsetvaluesmpiaij_(Mat *mmat, PetscInt *mm, const PetscInt im[], PetscInt *mn, const PetscInt in[], const PetscScalar v[], InsertMode *maddv, PetscErrorCode *_ierr)
8077 {
8078   Mat         mat = *mmat;
8079   PetscInt    m = *mm, n = *mn;
8080   InsertMode  addv = *maddv;
8081   Mat_MPIAIJ *aij  = (Mat_MPIAIJ *)mat->data;
8082   PetscScalar value;
8083 
8084   MatCheckPreallocated(mat, 1);
8085   if (mat->insertmode == NOT_SET_VALUES) mat->insertmode = addv;
8086   else PetscCheck(mat->insertmode == addv, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Cannot mix add values and insert values");
8087   {
8088     PetscInt  i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
8089     PetscInt  cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
8090     PetscBool roworiented = aij->roworiented;
8091 
8092     /* Some Variables required in the macro */
8093     Mat         A     = aij->A;
8094     Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
8095     PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
8096     MatScalar  *aa;
8097     PetscBool   ignorezeroentries = ((a->ignorezeroentries && (addv == ADD_VALUES)) ? PETSC_TRUE : PETSC_FALSE);
8098     Mat         B                 = aij->B;
8099     Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
8100     PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
8101     MatScalar  *ba;
8102     /* This variable below is only for the PETSC_HAVE_VIENNACL or PETSC_HAVE_CUDA cases, but we define it in all cases because we
8103      * cannot use "#if defined" inside a macro. */
8104     PETSC_UNUSED PetscBool inserted = PETSC_FALSE;
8105 
8106     PetscInt  *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
8107     PetscInt   nonew = a->nonew;
8108     MatScalar *ap1, *ap2;
8109 
8110     PetscFunctionBegin;
8111     PetscCall(MatSeqAIJGetArray(A, &aa));
8112     PetscCall(MatSeqAIJGetArray(B, &ba));
8113     for (i = 0; i < m; i++) {
8114       if (im[i] < 0) continue;
8115       PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
8116       if (im[i] >= rstart && im[i] < rend) {
8117         row      = im[i] - rstart;
8118         lastcol1 = -1;
8119         rp1      = aj + ai[row];
8120         ap1      = aa + ai[row];
8121         rmax1    = aimax[row];
8122         nrow1    = ailen[row];
8123         low1     = 0;
8124         high1    = nrow1;
8125         lastcol2 = -1;
8126         rp2      = bj + bi[row];
8127         ap2      = ba + bi[row];
8128         rmax2    = bimax[row];
8129         nrow2    = bilen[row];
8130         low2     = 0;
8131         high2    = nrow2;
8132 
8133         for (j = 0; j < n; j++) {
8134           if (roworiented) value = v[i * n + j];
8135           else value = v[i + j * m];
8136           if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
8137           if (in[j] >= cstart && in[j] < cend) {
8138             col = in[j] - cstart;
8139             MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
8140           } else if (in[j] < 0) continue;
8141           else if (PetscUnlikelyDebug(in[j] >= mat->cmap->N)) {
8142             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
8143           } else {
8144             if (mat->was_assembled) {
8145               if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
8146 #if defined(PETSC_USE_CTABLE)
8147               PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col));
8148               col--;
8149 #else
8150               col = aij->colmap[in[j]] - 1;
8151 #endif
8152               if (col < 0 && !((Mat_SeqAIJ *)aij->A->data)->nonew) {
8153                 PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
8154                 col = in[j];
8155                 /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
8156                 B        = aij->B;
8157                 b        = (Mat_SeqAIJ *)B->data;
8158                 bimax    = b->imax;
8159                 bi       = b->i;
8160                 bilen    = b->ilen;
8161                 bj       = b->j;
8162                 rp2      = bj + bi[row];
8163                 ap2      = ba + bi[row];
8164                 rmax2    = bimax[row];
8165                 nrow2    = bilen[row];
8166                 low2     = 0;
8167                 high2    = nrow2;
8168                 bm       = aij->B->rmap->n;
8169                 ba       = b->a;
8170                 inserted = PETSC_FALSE;
8171               }
8172             } else col = in[j];
8173             MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
8174           }
8175         }
8176       } else if (!aij->donotstash) {
8177         if (roworiented) {
8178           PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, v + i * n, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8179         } else {
8180           PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, v + i, m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8181         }
8182       }
8183     }
8184     PetscCall(MatSeqAIJRestoreArray(A, &aa));
8185     PetscCall(MatSeqAIJRestoreArray(B, &ba));
8186   }
8187   PetscFunctionReturnVoid();
8188 }
8189 
8190 /* Undefining these here since they were redefined from their original definition above! No
8191  * other PETSc functions should be defined past this point, as it is impossible to recover the
8192  * original definitions */
8193 #undef PetscCall
8194 #undef SETERRQ
8195