xref: /petsc/src/mat/impls/aij/mpi/mpiaij.c (revision 76f14e82e4ef5dcf684231ad11e66de8b15385f6)
1 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
2 #include <petsc/private/vecimpl.h>
3 #include <petsc/private/sfimpl.h>
4 #include <petsc/private/isimpl.h>
5 #include <petscblaslapack.h>
6 #include <petscsf.h>
7 #include <petsc/private/hashmapi.h>
8 
9 /* defines MatSetValues_MPI_Hash(), MatAssemblyBegin_MPI_Hash(), and MatAssemblyEnd_MPI_Hash() */
10 #define TYPE AIJ
11 #define TYPE_AIJ
12 #include "../src/mat/impls/aij/mpi/mpihashmat.h"
13 #undef TYPE
14 #undef TYPE_AIJ
15 
16 static PetscErrorCode MatReset_MPIAIJ(Mat mat)
17 {
18   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
19 
20   PetscFunctionBegin;
21   PetscCall(PetscLogObjectState((PetscObject)mat, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT, mat->rmap->N, mat->cmap->N));
22   PetscCall(MatStashDestroy_Private(&mat->stash));
23   PetscCall(VecDestroy(&aij->diag));
24   PetscCall(MatDestroy(&aij->A));
25   PetscCall(MatDestroy(&aij->B));
26 #if defined(PETSC_USE_CTABLE)
27   PetscCall(PetscHMapIDestroy(&aij->colmap));
28 #else
29   PetscCall(PetscFree(aij->colmap));
30 #endif
31   PetscCall(PetscFree(aij->garray));
32   PetscCall(VecDestroy(&aij->lvec));
33   PetscCall(VecScatterDestroy(&aij->Mvctx));
34   PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
35   PetscCall(PetscFree(aij->ld));
36   PetscFunctionReturn(PETSC_SUCCESS);
37 }
38 
39 static PetscErrorCode MatResetHash_MPIAIJ(Mat mat)
40 {
41   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
42   /* Save the nonzero states of the component matrices because they are used to determine
43      the nonzero state of mat */
44   PetscObjectState Astate = aij->A->nonzerostate, Bstate = aij->B->nonzerostate;
45 
46   PetscFunctionBegin;
47   PetscCall(MatReset_MPIAIJ(mat));
48   PetscCall(MatSetUp_MPI_Hash(mat));
49   aij->A->nonzerostate = ++Astate, aij->B->nonzerostate = ++Bstate;
50   PetscFunctionReturn(PETSC_SUCCESS);
51 }
52 
53 PetscErrorCode MatDestroy_MPIAIJ(Mat mat)
54 {
55   PetscFunctionBegin;
56   PetscCall(MatReset_MPIAIJ(mat));
57 
58   PetscCall(PetscFree(mat->data));
59 
60   /* may be created by MatCreateMPIAIJSumSeqAIJSymbolic */
61   PetscCall(PetscObjectCompose((PetscObject)mat, "MatMergeSeqsToMPI", NULL));
62 
63   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, NULL));
64   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatStoreValues_C", NULL));
65   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatRetrieveValues_C", NULL));
66   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatIsTranspose_C", NULL));
67   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocation_C", NULL));
68   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetPreallocation_C", NULL));
69   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatResetHash_C", NULL));
70   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetPreallocationCSR_C", NULL));
71   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatDiagonalScaleLocal_C", NULL));
72   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpibaij_C", NULL));
73   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisbaij_C", NULL));
74 #if defined(PETSC_HAVE_CUDA)
75   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcusparse_C", NULL));
76 #endif
77 #if defined(PETSC_HAVE_HIP)
78   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijhipsparse_C", NULL));
79 #endif
80 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
81   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijkokkos_C", NULL));
82 #endif
83   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpidense_C", NULL));
84 #if defined(PETSC_HAVE_ELEMENTAL)
85   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_elemental_C", NULL));
86 #endif
87 #if defined(PETSC_HAVE_SCALAPACK)
88   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_scalapack_C", NULL));
89 #endif
90 #if defined(PETSC_HAVE_HYPRE)
91   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_hypre_C", NULL));
92   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", NULL));
93 #endif
94   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
95   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_is_mpiaij_C", NULL));
96   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatProductSetFromOptions_mpiaij_mpiaij_C", NULL));
97   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatMPIAIJSetUseScalableIncreaseOverlap_C", NULL));
98   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijperm_C", NULL));
99   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijsell_C", NULL));
100 #if defined(PETSC_HAVE_MKL_SPARSE)
101   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijmkl_C", NULL));
102 #endif
103   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpiaijcrl_C", NULL));
104   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_is_C", NULL));
105   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatConvert_mpiaij_mpisell_C", NULL));
106   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetPreallocationCOO_C", NULL));
107   PetscCall(PetscObjectComposeFunction((PetscObject)mat, "MatSetValuesCOO_C", NULL));
108   PetscFunctionReturn(PETSC_SUCCESS);
109 }
110 
111 static PetscErrorCode MatGetRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
112 {
113   Mat B;
114 
115   PetscFunctionBegin;
116   PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &B));
117   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject)B));
118   PetscCall(MatGetRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
119   PetscCall(MatDestroy(&B));
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 static PetscErrorCode MatRestoreRowIJ_MPIAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *m, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
124 {
125   Mat B;
126 
127   PetscFunctionBegin;
128   PetscCall(PetscObjectQuery((PetscObject)A, "MatGetRowIJ_MPIAIJ", (PetscObject *)&B));
129   PetscCall(MatRestoreRowIJ(B, oshift, symmetric, inodecompressed, m, ia, ja, done));
130   PetscCall(PetscObjectCompose((PetscObject)A, "MatGetRowIJ_MPIAIJ", NULL));
131   PetscFunctionReturn(PETSC_SUCCESS);
132 }
133 
134 /*MC
135    MATAIJ - MATAIJ = "aij" - A matrix type to be used for sparse matrices.
136 
137    This matrix type is identical to `MATSEQAIJ` when constructed with a single process communicator,
138    and `MATMPIAIJ` otherwise.  As a result, for single process communicators,
139   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
140   for communicators controlling multiple processes.  It is recommended that you call both of
141   the above preallocation routines for simplicity.
142 
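  Example Usage (a minimal sketch, not part of the original source; error handling and cleanup omitted):
.vb
  Mat A;
  MatCreate(PETSC_COMM_WORLD, &A);
  MatSetSizes(A, PETSC_DECIDE, PETSC_DECIDE, 100, 100);
  MatSetType(A, MATAIJ);
  MatSeqAIJSetPreallocation(A, 5, NULL);          /* takes effect when the communicator has one process */
  MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL); /* takes effect when it has more than one */
.ve
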
143    Options Database Key:
144 . -mat_type aij - sets the matrix type to `MATAIJ` during a call to `MatSetFromOptions()`
145 
146 
147   Level: beginner
148 
149   Developer Note:
150   Subclasses include `MATAIJCUSPARSE`, `MATAIJPERM`, `MATAIJSELL`, `MATAIJMKL`, `MATAIJCRL`, and `MATAIJKOKKOS`; this type also automatically switches over to use inodes when enough exist.
151 
152 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateAIJ()`, `MatCreateSeqAIJ()`
153 M*/
154 
155 /*MC
156    MATAIJCRL - MATAIJCRL = "aijcrl" - A matrix type to be used for sparse matrices.
157 
158    This matrix type is identical to `MATSEQAIJCRL` when constructed with a single process communicator,
159    and `MATMPIAIJCRL` otherwise.  As a result, for single process communicators,
160    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
161   for communicators controlling multiple processes.  It is recommended that you call both of
162   the above preallocation routines for simplicity.
163 
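  Example Usage (a hedged sketch): like other `MATAIJ` variants, this format can be selected at runtime with no code changes,
.vb
  MatSetFromOptions(A);   /* then run the program with -mat_type aijcrl */
.ve
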
164    Options Database Key:
165 . -mat_type aijcrl - sets the matrix type to `MATAIJCRL` during a call to `MatSetFromOptions()`
166 
167   Level: beginner
168 
169 .seealso: [](ch_matrices), `Mat`, `MatCreateMPIAIJCRL()`, `MATSEQAIJCRL`, `MATMPIAIJCRL`
170 M*/
171 
172 static PetscErrorCode MatBindToCPU_MPIAIJ(Mat A, PetscBool flg)
173 {
174   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
175 
176   PetscFunctionBegin;
177 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_VIENNACL)
178   A->boundtocpu = flg;
179 #endif
180   if (a->A) PetscCall(MatBindToCPU(a->A, flg));
181   if (a->B) PetscCall(MatBindToCPU(a->B, flg));
182 
183   /* In addition to binding the diagonal and off-diagonal matrices, bind the local vectors used for matrix-vector products.
184    * This may seem like an odd thing for a MatBindToCPU() call to do, but it makes no sense for the binding of these vectors
185    * to differ from that of the parent matrix. */
186   if (a->lvec) PetscCall(VecBindToCPU(a->lvec, flg));
187   if (a->diag) PetscCall(VecBindToCPU(a->diag, flg));
188   PetscFunctionReturn(PETSC_SUCCESS);
189 }
190 
191 static PetscErrorCode MatSetBlockSizes_MPIAIJ(Mat M, PetscInt rbs, PetscInt cbs)
192 {
193   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)M->data;
194 
195   PetscFunctionBegin;
196   if (mat->A) {
197     PetscCall(MatSetBlockSizes(mat->A, rbs, cbs));
198     PetscCall(MatSetBlockSizes(mat->B, rbs, 1));
199   }
200   PetscFunctionReturn(PETSC_SUCCESS);
201 }
202 
203 static PetscErrorCode MatFindNonzeroRows_MPIAIJ(Mat M, IS *keptrows)
204 {
205   Mat_MPIAIJ      *mat = (Mat_MPIAIJ *)M->data;
206   Mat_SeqAIJ      *a   = (Mat_SeqAIJ *)mat->A->data;
207   Mat_SeqAIJ      *b   = (Mat_SeqAIJ *)mat->B->data;
208   const PetscInt  *ia, *ib;
209   const MatScalar *aa, *bb, *aav, *bav;
210   PetscInt         na, nb, i, j, *rows, cnt = 0, n0rows;
211   PetscInt         m = M->rmap->n, rstart = M->rmap->rstart;
212 
213   PetscFunctionBegin;
214   *keptrows = NULL;
215 
216   ia = a->i;
217   ib = b->i;
218   PetscCall(MatSeqAIJGetArrayRead(mat->A, &aav));
219   PetscCall(MatSeqAIJGetArrayRead(mat->B, &bav));
220   for (i = 0; i < m; i++) {
221     na = ia[i + 1] - ia[i];
222     nb = ib[i + 1] - ib[i];
223     if (!na && !nb) {
224       cnt++;
225       goto ok1;
226     }
227     aa = aav + ia[i];
228     for (j = 0; j < na; j++) {
229       if (aa[j] != 0.0) goto ok1;
230     }
231     bb = PetscSafePointerPlusOffset(bav, ib[i]);
232     for (j = 0; j < nb; j++) {
233       if (bb[j] != 0.0) goto ok1;
234     }
235     cnt++;
236   ok1:;
237   }
238   PetscCallMPI(MPIU_Allreduce(&cnt, &n0rows, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)M)));
239   if (!n0rows) {
240     PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
241     PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
242     PetscFunctionReturn(PETSC_SUCCESS);
243   }
244   PetscCall(PetscMalloc1(M->rmap->n - cnt, &rows));
245   cnt = 0;
246   for (i = 0; i < m; i++) {
247     na = ia[i + 1] - ia[i];
248     nb = ib[i + 1] - ib[i];
249     if (!na && !nb) continue;
250     aa = aav + ia[i];
251     for (j = 0; j < na; j++) {
252       if (aa[j] != 0.0) {
253         rows[cnt++] = rstart + i;
254         goto ok2;
255       }
256     }
257     bb = PetscSafePointerPlusOffset(bav, ib[i]);
258     for (j = 0; j < nb; j++) {
259       if (bb[j] != 0.0) {
260         rows[cnt++] = rstart + i;
261         goto ok2;
262       }
263     }
264   ok2:;
265   }
266   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), cnt, rows, PETSC_OWN_POINTER, keptrows));
267   PetscCall(MatSeqAIJRestoreArrayRead(mat->A, &aav));
268   PetscCall(MatSeqAIJRestoreArrayRead(mat->B, &bav));
269   PetscFunctionReturn(PETSC_SUCCESS);
270 }
271 
272 static PetscErrorCode MatDiagonalSet_MPIAIJ(Mat Y, Vec D, InsertMode is)
273 {
274   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)Y->data;
275   PetscBool   cong;
276 
277   PetscFunctionBegin;
278   PetscCall(MatHasCongruentLayouts(Y, &cong));
279   if (Y->assembled && cong) {
280     PetscCall(MatDiagonalSet(aij->A, D, is));
281   } else {
282     PetscCall(MatDiagonalSet_Default(Y, D, is));
283   }
284   PetscFunctionReturn(PETSC_SUCCESS);
285 }
286 
287 static PetscErrorCode MatFindZeroDiagonals_MPIAIJ(Mat M, IS *zrows)
288 {
289   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)M->data;
290   PetscInt    i, rstart, nrows, *rows;
291 
292   PetscFunctionBegin;
293   *zrows = NULL;
294   PetscCall(MatFindZeroDiagonals_SeqAIJ_Private(aij->A, &nrows, &rows));
295   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
296   for (i = 0; i < nrows; i++) rows[i] += rstart;
297   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)M), nrows, rows, PETSC_OWN_POINTER, zrows));
298   PetscFunctionReturn(PETSC_SUCCESS);
299 }
300 
301 static PetscErrorCode MatGetColumnReductions_MPIAIJ(Mat A, PetscInt type, PetscReal *reductions)
302 {
303   Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)A->data;
304   PetscInt           i, m, n, *garray = aij->garray;
305   Mat_SeqAIJ        *a_aij = (Mat_SeqAIJ *)aij->A->data;
306   Mat_SeqAIJ        *b_aij = (Mat_SeqAIJ *)aij->B->data;
307   PetscReal         *work;
308   const PetscScalar *dummy;
309 
310   PetscFunctionBegin;
311   PetscCall(MatGetSize(A, &m, &n));
312   PetscCall(PetscCalloc1(n, &work));
313   PetscCall(MatSeqAIJGetArrayRead(aij->A, &dummy));
314   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &dummy));
315   PetscCall(MatSeqAIJGetArrayRead(aij->B, &dummy));
316   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &dummy));
317   if (type == NORM_2) {
318     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i] * a_aij->a[i]);
319     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i] * b_aij->a[i]);
320   } else if (type == NORM_1) {
321     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscAbsScalar(a_aij->a[i]);
322     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscAbsScalar(b_aij->a[i]);
323   } else if (type == NORM_INFINITY) {
324     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] = PetscMax(PetscAbsScalar(a_aij->a[i]), work[A->cmap->rstart + a_aij->j[i]]);
325     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] = PetscMax(PetscAbsScalar(b_aij->a[i]), work[garray[b_aij->j[i]]]);
326   } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
327     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscRealPart(a_aij->a[i]);
328     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscRealPart(b_aij->a[i]);
329   } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
330     for (i = 0; i < a_aij->i[aij->A->rmap->n]; i++) work[A->cmap->rstart + a_aij->j[i]] += PetscImaginaryPart(a_aij->a[i]);
331     for (i = 0; i < b_aij->i[aij->B->rmap->n]; i++) work[garray[b_aij->j[i]]] += PetscImaginaryPart(b_aij->a[i]);
332   } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
333   if (type == NORM_INFINITY) {
334     PetscCallMPI(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)A)));
335   } else {
336     PetscCallMPI(MPIU_Allreduce(work, reductions, n, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)A)));
337   }
338   PetscCall(PetscFree(work));
339   if (type == NORM_2) {
340     for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
341   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
342     for (i = 0; i < n; i++) reductions[i] /= m;
343   }
344   PetscFunctionReturn(PETSC_SUCCESS);
345 }
346 
347 static PetscErrorCode MatFindOffBlockDiagonalEntries_MPIAIJ(Mat A, IS *is)
348 {
349   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
350   IS              sis, gis;
351   const PetscInt *isis, *igis;
352   PetscInt        n, *iis, nsis, ngis, rstart, i;
353 
354   PetscFunctionBegin;
355   PetscCall(MatFindOffBlockDiagonalEntries(a->A, &sis));
356   PetscCall(MatFindNonzeroRows(a->B, &gis));
357   PetscCall(ISGetSize(gis, &ngis));
358   PetscCall(ISGetSize(sis, &nsis));
359   PetscCall(ISGetIndices(sis, &isis));
360   PetscCall(ISGetIndices(gis, &igis));
361 
362   PetscCall(PetscMalloc1(ngis + nsis, &iis));
363   PetscCall(PetscArraycpy(iis, igis, ngis));
364   PetscCall(PetscArraycpy(iis + ngis, isis, nsis));
365   n = ngis + nsis;
366   PetscCall(PetscSortRemoveDupsInt(&n, iis));
367   PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
368   for (i = 0; i < n; i++) iis[i] += rstart;
369   PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)A), n, iis, PETSC_OWN_POINTER, is));
370 
371   PetscCall(ISRestoreIndices(sis, &isis));
372   PetscCall(ISRestoreIndices(gis, &igis));
373   PetscCall(ISDestroy(&sis));
374   PetscCall(ISDestroy(&gis));
375   PetscFunctionReturn(PETSC_SUCCESS);
376 }
377 
378 /*
379   Local utility routine that creates a mapping from the global column
380 number to the local number in the off-diagonal part of the local
381 storage of the matrix. When PETSC_USE_CTABLE is used this is scalable at
382 a slightly higher hash table cost; without it, it is not scalable (each process
383 has an order-N integer array) but access is fast.
384 */
385 PetscErrorCode MatCreateColmap_MPIAIJ_Private(Mat mat)
386 {
387   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
388   PetscInt    n   = aij->B->cmap->n, i;
389 
390   PetscFunctionBegin;
391   PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
392 #if defined(PETSC_USE_CTABLE)
393   PetscCall(PetscHMapICreateWithSize(n, &aij->colmap));
394   for (i = 0; i < n; i++) PetscCall(PetscHMapISet(aij->colmap, aij->garray[i] + 1, i + 1));
395 #else
396   PetscCall(PetscCalloc1(mat->cmap->N + 1, &aij->colmap));
397   for (i = 0; i < n; i++) aij->colmap[aij->garray[i]] = i + 1;
398 #endif
399   PetscFunctionReturn(PETSC_SUCCESS);
400 }
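
/* Lookup sketch (an illustration of the convention above, not part of the original source):
   translate a global column id gcol into a local column index of B. Entries are stored
   shifted by +1 so that a miss comes back as 0 and decrements to -1, i.e. lcol < 0 means
   gcol is not currently a column of B.

     PetscInt lcol;
   #if defined(PETSC_USE_CTABLE)
     PetscHMapIGetWithDefault(aij->colmap, gcol + 1, 0, &lcol);
     lcol--;
   #else
     lcol = aij->colmap[gcol] - 1;
   #endif
*/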
401 
402 #define MatSetValues_SeqAIJ_A_Private(row, col, value, addv, orow, ocol) \
403   do { \
404     if (col <= lastcol1) low1 = 0; \
405     else high1 = nrow1; \
406     lastcol1 = col; \
407     while (high1 - low1 > 5) { \
408       t = (low1 + high1) / 2; \
409       if (rp1[t] > col) high1 = t; \
410       else low1 = t; \
411     } \
412     for (_i = low1; _i < high1; _i++) { \
413       if (rp1[_i] > col) break; \
414       if (rp1[_i] == col) { \
415         if (addv == ADD_VALUES) { \
416           ap1[_i] += value; \
417           /* Not sure whether LogFlops will slow down the code or not */ \
418           (void)PetscLogFlops(1.0); \
419         } else ap1[_i] = value; \
420         goto a_noinsert; \
421       } \
422     } \
423     if (value == 0.0 && ignorezeroentries && row != col) { \
424       low1  = 0; \
425       high1 = nrow1; \
426       goto a_noinsert; \
427     } \
428     if (nonew == 1) { \
429       low1  = 0; \
430       high1 = nrow1; \
431       goto a_noinsert; \
432     } \
433     PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
434     MatSeqXAIJReallocateAIJ(A, am, 1, nrow1, row, col, rmax1, aa, ai, aj, rp1, ap1, aimax, nonew, MatScalar); \
435     N = nrow1++ - 1; \
436     a->nz++; \
437     high1++; \
438     /* shift up all the later entries in this row */ \
439     PetscCall(PetscArraymove(rp1 + _i + 1, rp1 + _i, N - _i + 1)); \
440     PetscCall(PetscArraymove(ap1 + _i + 1, ap1 + _i, N - _i + 1)); \
441     rp1[_i] = col; \
442     ap1[_i] = value; \
443   a_noinsert:; \
444     ailen[row] = nrow1; \
445   } while (0)
446 
447 #define MatSetValues_SeqAIJ_B_Private(row, col, value, addv, orow, ocol) \
448   do { \
449     if (col <= lastcol2) low2 = 0; \
450     else high2 = nrow2; \
451     lastcol2 = col; \
452     while (high2 - low2 > 5) { \
453       t = (low2 + high2) / 2; \
454       if (rp2[t] > col) high2 = t; \
455       else low2 = t; \
456     } \
457     for (_i = low2; _i < high2; _i++) { \
458       if (rp2[_i] > col) break; \
459       if (rp2[_i] == col) { \
460         if (addv == ADD_VALUES) { \
461           ap2[_i] += value; \
462           (void)PetscLogFlops(1.0); \
463         } else ap2[_i] = value; \
464         goto b_noinsert; \
465       } \
466     } \
467     if (value == 0.0 && ignorezeroentries) { \
468       low2  = 0; \
469       high2 = nrow2; \
470       goto b_noinsert; \
471     } \
472     if (nonew == 1) { \
473       low2  = 0; \
474       high2 = nrow2; \
475       goto b_noinsert; \
476     } \
477     PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \
478     MatSeqXAIJReallocateAIJ(B, bm, 1, nrow2, row, col, rmax2, ba, bi, bj, rp2, ap2, bimax, nonew, MatScalar); \
479     N = nrow2++ - 1; \
480     b->nz++; \
481     high2++; \
482     /* shift up all the later entries in this row */ \
483     PetscCall(PetscArraymove(rp2 + _i + 1, rp2 + _i, N - _i + 1)); \
484     PetscCall(PetscArraymove(ap2 + _i + 1, ap2 + _i, N - _i + 1)); \
485     rp2[_i] = col; \
486     ap2[_i] = value; \
487   b_noinsert:; \
488     bilen[row] = nrow2; \
489   } while (0)
490 
491 static PetscErrorCode MatSetValuesRow_MPIAIJ(Mat A, PetscInt row, const PetscScalar v[])
492 {
493   Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)A->data;
494   Mat_SeqAIJ  *a = (Mat_SeqAIJ *)mat->A->data, *b = (Mat_SeqAIJ *)mat->B->data;
495   PetscInt     l, *garray                         = mat->garray, diag;
496   PetscScalar *aa, *ba;
497 
498   PetscFunctionBegin;
499   /* code only works for square matrices A */
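  /* Worked illustration (hedged, not from the original source): v supplies the row's values
     ordered by global column: first the off-diagonal (B) entries left of the diagonal block,
     then all diagonal (A) entries, then the remaining B entries. E.g. with ownership range
     [diag, diag+m) and global columns {1, diag, diag+2, diag+m+3}, l below comes out as 1. */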
500 
501   /* find size of row to the left of the diagonal part */
502   PetscCall(MatGetOwnershipRange(A, &diag, NULL));
503   row = row - diag;
504   for (l = 0; l < b->i[row + 1] - b->i[row]; l++) {
505     if (garray[b->j[b->i[row] + l]] > diag) break;
506   }
507   if (l) {
508     PetscCall(MatSeqAIJGetArray(mat->B, &ba));
509     PetscCall(PetscArraycpy(ba + b->i[row], v, l));
510     PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
511   }
512 
513   /* diagonal part */
514   if (a->i[row + 1] - a->i[row]) {
515     PetscCall(MatSeqAIJGetArray(mat->A, &aa));
516     PetscCall(PetscArraycpy(aa + a->i[row], v + l, a->i[row + 1] - a->i[row]));
517     PetscCall(MatSeqAIJRestoreArray(mat->A, &aa));
518   }
519 
520   /* right of diagonal part */
521   if (b->i[row + 1] - b->i[row] - l) {
522     PetscCall(MatSeqAIJGetArray(mat->B, &ba));
523     PetscCall(PetscArraycpy(ba + b->i[row] + l, v + l + a->i[row + 1] - a->i[row], b->i[row + 1] - b->i[row] - l));
524     PetscCall(MatSeqAIJRestoreArray(mat->B, &ba));
525   }
526   PetscFunctionReturn(PETSC_SUCCESS);
527 }
528 
529 PetscErrorCode MatSetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode addv)
530 {
531   Mat_MPIAIJ *aij   = (Mat_MPIAIJ *)mat->data;
532   PetscScalar value = 0.0;
533   PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
534   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
535   PetscBool   roworiented = aij->roworiented;
536 
537   /* Some variables required by the MatSetValues_SeqAIJ_*_Private() macros */
538   Mat         A     = aij->A;
539   Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
540   PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
541   PetscBool   ignorezeroentries = a->ignorezeroentries;
542   Mat         B                 = aij->B;
543   Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
544   PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
545   MatScalar  *aa, *ba;
546   PetscInt   *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
547   PetscInt    nonew;
548   MatScalar  *ap1, *ap2;
549 
550   PetscFunctionBegin;
551   PetscCall(MatSeqAIJGetArray(A, &aa));
552   PetscCall(MatSeqAIJGetArray(B, &ba));
553   for (i = 0; i < m; i++) {
554     if (im[i] < 0) continue;
555     PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
556     if (im[i] >= rstart && im[i] < rend) {
557       row      = im[i] - rstart;
558       lastcol1 = -1;
559       rp1      = PetscSafePointerPlusOffset(aj, ai[row]);
560       ap1      = PetscSafePointerPlusOffset(aa, ai[row]);
561       rmax1    = aimax[row];
562       nrow1    = ailen[row];
563       low1     = 0;
564       high1    = nrow1;
565       lastcol2 = -1;
566       rp2      = PetscSafePointerPlusOffset(bj, bi[row]);
567       ap2      = PetscSafePointerPlusOffset(ba, bi[row]);
568       rmax2    = bimax[row];
569       nrow2    = bilen[row];
570       low2     = 0;
571       high2    = nrow2;
572 
573       for (j = 0; j < n; j++) {
574         if (v) value = roworiented ? v[i * n + j] : v[i + j * m];
575         if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
576         if (in[j] >= cstart && in[j] < cend) {
577           col   = in[j] - cstart;
578           nonew = a->nonew;
579           MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
580         } else if (in[j] < 0) {
581           continue;
582         } else {
583           PetscCheck(in[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
584           if (mat->was_assembled) {
585             if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
586 #if defined(PETSC_USE_CTABLE)
587             PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col)); /* map global col ids to local ones */
588             col--;
589 #else
590             col = aij->colmap[in[j]] - 1;
591 #endif
592             if (col < 0 && !((Mat_SeqAIJ *)aij->B->data)->nonew) { /* col < 0 means in[j] is a new col for B */
593               PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));  /* Change aij->B from reduced/local format to expanded/global format */
594               col = in[j];
595               /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
596               B     = aij->B;
597               b     = (Mat_SeqAIJ *)B->data;
598               bimax = b->imax;
599               bi    = b->i;
600               bilen = b->ilen;
601               bj    = b->j;
602               ba    = b->a;
603               rp2   = PetscSafePointerPlusOffset(bj, bi[row]);
604               ap2   = PetscSafePointerPlusOffset(ba, bi[row]);
605               rmax2 = bimax[row];
606               nrow2 = bilen[row];
607               low2  = 0;
608               high2 = nrow2;
609               bm    = aij->B->rmap->n;
610               ba    = b->a;
611             } else if (col < 0 && !(ignorezeroentries && value == 0.0)) {
612               if (1 == ((Mat_SeqAIJ *)aij->B->data)->nonew) {
613                 PetscCall(PetscInfo(mat, "Skipping of insertion of new nonzero location in off-diagonal portion of matrix %g(%" PetscInt_FMT ",%" PetscInt_FMT ")\n", (double)PetscRealPart(value), im[i], in[j]));
614               } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", im[i], in[j]);
615             }
616           } else col = in[j];
617           nonew = b->nonew;
618           MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
619         }
620       }
621     } else {
622       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Setting off process row %" PetscInt_FMT " even though MatSetOption(,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) was set", im[i]);
623       if (!aij->donotstash) {
624         mat->assembled = PETSC_FALSE;
625         if (roworiented) {
626           PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i * n), (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
627         } else {
628           PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, PetscSafePointerPlusOffset(v, i), m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
629         }
630       }
631     }
632   }
633   PetscCall(MatSeqAIJRestoreArray(A, &aa)); /* aa, bb might have been free'd due to reallocation above. But we don't access them here */
634   PetscCall(MatSeqAIJRestoreArray(B, &ba));
635   PetscFunctionReturn(PETSC_SUCCESS);
636 }
637 
638 /*
639     This function sets the j and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
640     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
641     No off-processor parts of the matrix are allowed here and mat->was_assembled has to be PETSC_FALSE.
642 */
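/* Worked example (a hedged illustration): with cstart = 2 and cend = 4, a row whose sorted
   global columns are {0, 2, 3, 5} is split into aj = {0, 1} (the locals 2 - cstart and
   3 - cstart, so ailen gets 2) and bj = {0, 5} (kept global at this stage, so bilen gets 2). */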
643 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[])
644 {
645   Mat_MPIAIJ *aij    = (Mat_MPIAIJ *)mat->data;
646   Mat         A      = aij->A; /* diagonal part of the matrix */
647   Mat         B      = aij->B; /* off-diagonal part of the matrix */
648   Mat_SeqAIJ *a      = (Mat_SeqAIJ *)A->data;
649   Mat_SeqAIJ *b      = (Mat_SeqAIJ *)B->data;
650   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, col;
651   PetscInt   *ailen = a->ilen, *aj = a->j;
652   PetscInt   *bilen = b->ilen, *bj = b->j;
653   PetscInt    am          = aij->A->rmap->n, j;
654   PetscInt    diag_so_far = 0, dnz;
655   PetscInt    offd_so_far = 0, onz;
656 
657   PetscFunctionBegin;
658   /* Iterate over all rows of the matrix */
659   for (j = 0; j < am; j++) {
660     dnz = onz = 0;
661     /*  Iterate over all non-zero columns of the current row */
662     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
663       /* If column is in the diagonal */
664       if (mat_j[col] >= cstart && mat_j[col] < cend) {
665         aj[diag_so_far++] = mat_j[col] - cstart;
666         dnz++;
667       } else { /* off-diagonal entries */
668         bj[offd_so_far++] = mat_j[col];
669         onz++;
670       }
671     }
672     ailen[j] = dnz;
673     bilen[j] = onz;
674   }
675   PetscFunctionReturn(PETSC_SUCCESS);
676 }
677 
678 /*
679     This function sets the local j, a and ilen arrays (of the diagonal and off-diagonal part) of an MPIAIJ-matrix.
680     The values in mat_i have to be sorted and the values in mat_j have to be sorted for each row (CSR-like).
681     No off-processor parts of the matrix are allowed here; they are set at a later point by MatSetValues_MPIAIJ.
682     Also, mat->was_assembled has to be false; otherwise the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart;
683     would not be true and the more complex MatSetValues_MPIAIJ has to be used.
684 */
685 PetscErrorCode MatSetValues_MPIAIJ_CopyFromCSRFormat(Mat mat, const PetscInt mat_j[], const PetscInt mat_i[], const PetscScalar mat_a[])
686 {
687   Mat_MPIAIJ  *aij  = (Mat_MPIAIJ *)mat->data;
688   Mat          A    = aij->A; /* diagonal part of the matrix */
689   Mat          B    = aij->B; /* off-diagonal part of the matrix */
690   Mat_SeqAIJ  *aijd = (Mat_SeqAIJ *)aij->A->data, *aijo = (Mat_SeqAIJ *)aij->B->data;
691   Mat_SeqAIJ  *a      = (Mat_SeqAIJ *)A->data;
692   Mat_SeqAIJ  *b      = (Mat_SeqAIJ *)B->data;
693   PetscInt     cstart = mat->cmap->rstart, cend = mat->cmap->rend;
694   PetscInt    *ailen = a->ilen, *aj = a->j;
695   PetscInt    *bilen = b->ilen, *bj = b->j;
696   PetscInt     am          = aij->A->rmap->n, j;
697   PetscInt    *full_diag_i = aijd->i, *full_offd_i = aijo->i; /* These variables can also include non-local elements, which are set at a later point. */
698   PetscInt     col, dnz_row, onz_row, rowstart_diag, rowstart_offd;
699   PetscScalar *aa = a->a, *ba = b->a;
700 
701   PetscFunctionBegin;
702   /* Iterate over all rows of the matrix */
703   for (j = 0; j < am; j++) {
704     dnz_row = onz_row = 0;
705     rowstart_offd     = full_offd_i[j];
706     rowstart_diag     = full_diag_i[j];
707     /*  Iterate over all non-zero columns of the current row */
708     for (col = mat_i[j]; col < mat_i[j + 1]; col++) {
709       /* If column is in the diagonal */
710       if (mat_j[col] >= cstart && mat_j[col] < cend) {
711         aj[rowstart_diag + dnz_row] = mat_j[col] - cstart;
712         aa[rowstart_diag + dnz_row] = mat_a[col];
713         dnz_row++;
714       } else { /* off-diagonal entries */
715         bj[rowstart_offd + onz_row] = mat_j[col];
716         ba[rowstart_offd + onz_row] = mat_a[col];
717         onz_row++;
718       }
719     }
720     ailen[j] = dnz_row;
721     bilen[j] = onz_row;
722   }
723   PetscFunctionReturn(PETSC_SUCCESS);
724 }
725 
726 static PetscErrorCode MatGetValues_MPIAIJ(Mat mat, PetscInt m, const PetscInt idxm[], PetscInt n, const PetscInt idxn[], PetscScalar v[])
727 {
728   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
729   PetscInt    i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
730   PetscInt    cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
731 
732   PetscFunctionBegin;
733   for (i = 0; i < m; i++) {
734     if (idxm[i] < 0) continue; /* negative row */
735     PetscCheck(idxm[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, idxm[i], mat->rmap->N - 1);
736     PetscCheck(idxm[i] >= rstart && idxm[i] < rend, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only local values currently supported, row requested %" PetscInt_FMT " range [%" PetscInt_FMT " %" PetscInt_FMT ")", idxm[i], rstart, rend);
737     row = idxm[i] - rstart;
738     for (j = 0; j < n; j++) {
739       if (idxn[j] < 0) continue; /* negative column */
740       PetscCheck(idxn[j] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, idxn[j], mat->cmap->N - 1);
741       if (idxn[j] >= cstart && idxn[j] < cend) {
742         col = idxn[j] - cstart;
743         PetscCall(MatGetValues(aij->A, 1, &row, 1, &col, v + i * n + j));
744       } else {
745         if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
746 #if defined(PETSC_USE_CTABLE)
747         PetscCall(PetscHMapIGetWithDefault(aij->colmap, idxn[j] + 1, 0, &col));
748         col--;
749 #else
750         col = aij->colmap[idxn[j]] - 1;
751 #endif
752         if ((col < 0) || (aij->garray[col] != idxn[j])) *(v + i * n + j) = 0.0;
753         else PetscCall(MatGetValues(aij->B, 1, &row, 1, &col, v + i * n + j));
754       }
755     }
756   }
757   PetscFunctionReturn(PETSC_SUCCESS);
758 }
759 
760 static PetscErrorCode MatAssemblyBegin_MPIAIJ(Mat mat, MatAssemblyType mode)
761 {
762   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
763   PetscInt    nstash, reallocs;
764 
765   PetscFunctionBegin;
766   if (aij->donotstash || mat->nooffprocentries) PetscFunctionReturn(PETSC_SUCCESS);
767 
768   PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
769   PetscCall(MatStashGetInfo_Private(&mat->stash, &nstash, &reallocs));
770   PetscCall(PetscInfo(aij->A, "Stash has %" PetscInt_FMT " entries, uses %" PetscInt_FMT " mallocs.\n", nstash, reallocs));
771   PetscFunctionReturn(PETSC_SUCCESS);
772 }
773 
774 PetscErrorCode MatAssemblyEnd_MPIAIJ(Mat mat, MatAssemblyType mode)
775 {
776   Mat_MPIAIJ  *aij = (Mat_MPIAIJ *)mat->data;
777   PetscMPIInt  n;
778   PetscInt     i, j, rstart, ncols, flg;
779   PetscInt    *row, *col;
780   PetscBool    all_assembled;
781   PetscScalar *val;
782 
783   /* do not use 'b = (Mat_SeqAIJ*)aij->B->data' as B can be reset in disassembly */
784 
785   PetscFunctionBegin;
786   if (!aij->donotstash && !mat->nooffprocentries) {
787     while (1) {
788       PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
789       if (!flg) break;
790 
791       for (i = 0; i < n;) {
792         /* Now identify the consecutive vals belonging to the same row */
793         for (j = i, rstart = row[j]; j < n; j++) {
794           if (row[j] != rstart) break;
795         }
796         if (j < n) ncols = j - i;
797         else ncols = n - i;
798         /* Now assemble all these values with a single function call */
799         PetscCall(MatSetValues_MPIAIJ(mat, 1, row + i, ncols, col + i, val + i, mat->insertmode));
800         i = j;
801       }
802     }
803     PetscCall(MatStashScatterEnd_Private(&mat->stash));
804   }
805 #if defined(PETSC_HAVE_DEVICE)
806   if (mat->offloadmask == PETSC_OFFLOAD_CPU) aij->A->offloadmask = PETSC_OFFLOAD_CPU;
807   /* We call MatBindToCPU() on aij->A and aij->B here, because if MatBindToCPU_MPIAIJ() is called before assembly, it cannot bind these. */
808   if (mat->boundtocpu) {
809     PetscCall(MatBindToCPU(aij->A, PETSC_TRUE));
810     PetscCall(MatBindToCPU(aij->B, PETSC_TRUE));
811   }
812 #endif
813   PetscCall(MatAssemblyBegin(aij->A, mode));
814   PetscCall(MatAssemblyEnd(aij->A, mode));
815 
816   /* determine if any process has disassembled, if so we must
817      also disassemble ourself, in order that we may reassemble. */
818   /*
819      if nonzero structure of submatrix B cannot change then we know that
820      no process disassembled thus we can skip this stuff
821   */
822   if (!((Mat_SeqAIJ *)aij->B->data)->nonew) {
823     PetscCallMPI(MPIU_Allreduce(&mat->was_assembled, &all_assembled, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
824     if (mat->was_assembled && !all_assembled) { /* mat on this rank has reduced off-diag B with local col ids, but globally it does not */
825       PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
826     }
827   }
828   if (!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(MatSetUpMultiply_MPIAIJ(mat));
829   PetscCall(MatSetOption(aij->B, MAT_USE_INODES, PETSC_FALSE));
830 #if defined(PETSC_HAVE_DEVICE)
831   if (mat->offloadmask == PETSC_OFFLOAD_CPU && aij->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) aij->B->offloadmask = PETSC_OFFLOAD_CPU;
832 #endif
833   PetscCall(MatAssemblyBegin(aij->B, mode));
834   PetscCall(MatAssemblyEnd(aij->B, mode));
835 
836   PetscCall(PetscFree2(aij->rowvalues, aij->rowindices));
837 
838   aij->rowvalues = NULL;
839 
840   PetscCall(VecDestroy(&aij->diag));
841 
842   /* if no new nonzero locations are allowed in the matrix then only set the matrix state the first time through */
843   if ((!mat->was_assembled && mode == MAT_FINAL_ASSEMBLY) || !((Mat_SeqAIJ *)aij->A->data)->nonew) {
844     PetscObjectState state = aij->A->nonzerostate + aij->B->nonzerostate;
845     PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
846   }
847 #if defined(PETSC_HAVE_DEVICE)
848   mat->offloadmask = PETSC_OFFLOAD_BOTH;
849 #endif
850   PetscFunctionReturn(PETSC_SUCCESS);
851 }
852 
853 static PetscErrorCode MatZeroEntries_MPIAIJ(Mat A)
854 {
855   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
856 
857   PetscFunctionBegin;
858   PetscCall(MatZeroEntries(l->A));
859   PetscCall(MatZeroEntries(l->B));
860   PetscFunctionReturn(PETSC_SUCCESS);
861 }
862 
863 static PetscErrorCode MatZeroRows_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
864 {
865   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)A->data;
866   PetscInt   *lrows;
867   PetscInt    r, len;
868   PetscBool   cong;
869 
870   PetscFunctionBegin;
871   /* get locally owned rows */
872   PetscCall(MatZeroRowsMapLocal_Private(A, N, rows, &len, &lrows));
873   PetscCall(MatHasCongruentLayouts(A, &cong));
874   /* fix right-hand side if needed */
875   if (x && b) {
876     const PetscScalar *xx;
877     PetscScalar       *bb;
878 
879     PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
880     PetscCall(VecGetArrayRead(x, &xx));
881     PetscCall(VecGetArray(b, &bb));
882     for (r = 0; r < len; ++r) bb[lrows[r]] = diag * xx[lrows[r]];
883     PetscCall(VecRestoreArrayRead(x, &xx));
884     PetscCall(VecRestoreArray(b, &bb));
885   }
886 
887   if (diag != 0.0 && cong) {
888     PetscCall(MatZeroRows(mat->A, len, lrows, diag, NULL, NULL));
889     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
890   } else if (diag != 0.0) { /* non-square or non-congruent layouts -> if keepnonzeropattern is false, we allow for new insertion */
891     Mat_SeqAIJ *aijA = (Mat_SeqAIJ *)mat->A->data;
892     Mat_SeqAIJ *aijB = (Mat_SeqAIJ *)mat->B->data;
893     PetscInt    nnwA, nnwB;
894     PetscBool   nnzA, nnzB;
895 
896     nnwA = aijA->nonew;
897     nnwB = aijB->nonew;
898     nnzA = aijA->keepnonzeropattern;
899     nnzB = aijB->keepnonzeropattern;
900     if (!nnzA) {
901       PetscCall(PetscInfo(mat->A, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on diagonal block.\n"));
902       aijA->nonew = 0;
903     }
904     if (!nnzB) {
905       PetscCall(PetscInfo(mat->B, "Requested to not keep the pattern and add a nonzero diagonal; may encounter reallocations on off-diagonal block.\n"));
906       aijB->nonew = 0;
907     }
908     /* Must zero here before the next loop */
909     PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
910     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
911     for (r = 0; r < len; ++r) {
912       const PetscInt row = lrows[r] + A->rmap->rstart;
913       if (row >= A->cmap->N) continue;
914       PetscCall(MatSetValues(A, 1, &row, 1, &row, &diag, INSERT_VALUES));
915     }
916     aijA->nonew = nnwA;
917     aijB->nonew = nnwB;
918   } else {
919     PetscCall(MatZeroRows(mat->A, len, lrows, 0.0, NULL, NULL));
920     PetscCall(MatZeroRows(mat->B, len, lrows, 0.0, NULL, NULL));
921   }
922   PetscCall(PetscFree(lrows));
923   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
924   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
925 
926   /* only change matrix nonzero state if pattern was allowed to be changed */
927   if (!((Mat_SeqAIJ *)mat->A->data)->keepnonzeropattern || !((Mat_SeqAIJ *)mat->A->data)->nonew) {
928     PetscObjectState state = mat->A->nonzerostate + mat->B->nonzerostate;
929     PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
930   }
931   PetscFunctionReturn(PETSC_SUCCESS);
932 }
933 
934 static PetscErrorCode MatZeroRowsColumns_MPIAIJ(Mat A, PetscInt N, const PetscInt rows[], PetscScalar diag, Vec x, Vec b)
935 {
936   Mat_MPIAIJ        *l = (Mat_MPIAIJ *)A->data;
937   PetscInt           n = A->rmap->n;
938   PetscInt           i, j, r, m, len = 0;
939   PetscInt          *lrows, *owners = A->rmap->range;
940   PetscMPIInt        p = 0;
941   PetscSFNode       *rrows;
942   PetscSF            sf;
943   const PetscScalar *xx;
944   PetscScalar       *bb, *mask, *aij_a;
945   Vec                xmask, lmask;
946   Mat_SeqAIJ        *aij = (Mat_SeqAIJ *)l->B->data;
947   const PetscInt    *aj, *ii, *ridx;
948   PetscScalar       *aa;
949 
950   PetscFunctionBegin;
951   /* Create SF where leaves are input rows and roots are owned rows */
952   PetscCall(PetscMalloc1(n, &lrows));
953   for (r = 0; r < n; ++r) lrows[r] = -1;
954   PetscCall(PetscMalloc1(N, &rrows));
955   for (r = 0; r < N; ++r) {
956     const PetscInt idx = rows[r];
957     PetscCheck(idx >= 0 && A->rmap->N > idx, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range [0,%" PetscInt_FMT ")", idx, A->rmap->N);
958     if (idx < owners[p] || owners[p + 1] <= idx) { /* short-circuit the search if the last p owns this row too */
959       PetscCall(PetscLayoutFindOwner(A->rmap, idx, &p));
960     }
961     rrows[r].rank  = p;
962     rrows[r].index = rows[r] - owners[p];
963   }
964   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
965   PetscCall(PetscSFSetGraph(sf, n, N, NULL, PETSC_OWN_POINTER, rrows, PETSC_OWN_POINTER));
966   /* Collect flags for rows to be zeroed */
967   PetscCall(PetscSFReduceBegin(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
968   PetscCall(PetscSFReduceEnd(sf, MPIU_INT, (PetscInt *)rows, lrows, MPI_LOR));
969   PetscCall(PetscSFDestroy(&sf));
970   /* Compress and put in row numbers */
971   for (r = 0; r < n; ++r)
972     if (lrows[r] >= 0) lrows[len++] = r;
973   /* zero diagonal part of matrix */
974   PetscCall(MatZeroRowsColumns(l->A, len, lrows, diag, x, b));
975   /* handle off-diagonal part of matrix */
976   PetscCall(MatCreateVecs(A, &xmask, NULL));
977   PetscCall(VecDuplicate(l->lvec, &lmask));
978   PetscCall(VecGetArray(xmask, &bb));
979   for (i = 0; i < len; i++) bb[lrows[i]] = 1;
980   PetscCall(VecRestoreArray(xmask, &bb));
981   PetscCall(VecScatterBegin(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
982   PetscCall(VecScatterEnd(l->Mvctx, xmask, lmask, ADD_VALUES, SCATTER_FORWARD));
983   PetscCall(VecDestroy(&xmask));
984   if (x && b) { /* this code is buggy when the row and column layout don't match */
985     PetscBool cong;
986 
987     PetscCall(MatHasCongruentLayouts(A, &cong));
988     PetscCheck(cong, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need matching row/col layout");
989     PetscCall(VecScatterBegin(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
990     PetscCall(VecScatterEnd(l->Mvctx, x, l->lvec, INSERT_VALUES, SCATTER_FORWARD));
991     PetscCall(VecGetArrayRead(l->lvec, &xx));
992     PetscCall(VecGetArray(b, &bb));
993   }
994   PetscCall(VecGetArray(lmask, &mask));
995   /* remove zeroed rows of off-diagonal matrix */
996   PetscCall(MatSeqAIJGetArray(l->B, &aij_a));
997   ii = aij->i;
998   for (i = 0; i < len; i++) PetscCall(PetscArrayzero(PetscSafePointerPlusOffset(aij_a, ii[lrows[i]]), ii[lrows[i] + 1] - ii[lrows[i]]));
999   /* loop over all elements of the off-process part of the matrix, zeroing removed columns */
1000   if (aij->compressedrow.use) {
1001     m    = aij->compressedrow.nrows;
1002     ii   = aij->compressedrow.i;
1003     ridx = aij->compressedrow.rindex;
1004     for (i = 0; i < m; i++) {
1005       n  = ii[i + 1] - ii[i];
1006       aj = aij->j + ii[i];
1007       aa = aij_a + ii[i];
1008 
1009       for (j = 0; j < n; j++) {
1010         if (PetscAbsScalar(mask[*aj])) {
1011           if (b) bb[*ridx] -= *aa * xx[*aj];
1012           *aa = 0.0;
1013         }
1014         aa++;
1015         aj++;
1016       }
1017       ridx++;
1018     }
1019   } else { /* do not use compressed row format */
1020     m = l->B->rmap->n;
1021     for (i = 0; i < m; i++) {
1022       n  = ii[i + 1] - ii[i];
1023       aj = aij->j + ii[i];
1024       aa = aij_a + ii[i];
1025       for (j = 0; j < n; j++) {
1026         if (PetscAbsScalar(mask[*aj])) {
1027           if (b) bb[i] -= *aa * xx[*aj];
1028           *aa = 0.0;
1029         }
1030         aa++;
1031         aj++;
1032       }
1033     }
1034   }
1035   if (x && b) {
1036     PetscCall(VecRestoreArray(b, &bb));
1037     PetscCall(VecRestoreArrayRead(l->lvec, &xx));
1038   }
1039   PetscCall(MatSeqAIJRestoreArray(l->B, &aij_a));
1040   PetscCall(VecRestoreArray(lmask, &mask));
1041   PetscCall(VecDestroy(&lmask));
1042   PetscCall(PetscFree(lrows));
1043 
1044   /* only change matrix nonzero state if pattern was allowed to be changed */
1045   if (!((Mat_SeqAIJ *)l->A->data)->nonew) {
1046     PetscObjectState state = l->A->nonzerostate + l->B->nonzerostate;
1047     PetscCallMPI(MPIU_Allreduce(&state, &A->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)A)));
1048   }
1049   PetscFunctionReturn(PETSC_SUCCESS);
1050 }
1051 
1052 static PetscErrorCode MatMult_MPIAIJ(Mat A, Vec xx, Vec yy)
1053 {
1054   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1055   PetscInt    nt;
1056   VecScatter  Mvctx = a->Mvctx;
1057 
1058   PetscFunctionBegin;
1059   PetscCall(VecGetLocalSize(xx, &nt));
1060   PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
1061   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1062   PetscUseTypeMethod(a->A, mult, xx, yy);
1063   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1064   PetscUseTypeMethod(a->B, multadd, a->lvec, yy, yy);
1065   PetscFunctionReturn(PETSC_SUCCESS);
1066 }
1067 
1068 static PetscErrorCode MatMultDiagonalBlock_MPIAIJ(Mat A, Vec bb, Vec xx)
1069 {
1070   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1071 
1072   PetscFunctionBegin;
1073   PetscCall(MatMultDiagonalBlock(a->A, bb, xx));
1074   PetscFunctionReturn(PETSC_SUCCESS);
1075 }
1076 
1077 static PetscErrorCode MatMultAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1078 {
1079   Mat_MPIAIJ *a     = (Mat_MPIAIJ *)A->data;
1080   VecScatter  Mvctx = a->Mvctx;
1081 
1082   PetscFunctionBegin;
1083   PetscCall(VecScatterBegin(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1084   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
1085   PetscCall(VecScatterEnd(Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
1086   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
1087   PetscFunctionReturn(PETSC_SUCCESS);
1088 }
1089 
1090 static PetscErrorCode MatMultTranspose_MPIAIJ(Mat A, Vec xx, Vec yy)
1091 {
1092   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1093 
1094   PetscFunctionBegin;
1095   /* do nondiagonal part */
1096   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1097   /* do local part */
1098   PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
1099   /* add partial results together */
1100   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1101   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
1102   PetscFunctionReturn(PETSC_SUCCESS);
1103 }
1104 
1105 static PetscErrorCode MatIsTranspose_MPIAIJ(Mat Amat, Mat Bmat, PetscReal tol, PetscBool *f)
1106 {
1107   MPI_Comm    comm;
1108   Mat_MPIAIJ *Aij = (Mat_MPIAIJ *)Amat->data, *Bij = (Mat_MPIAIJ *)Bmat->data;
1109   Mat         Adia = Aij->A, Bdia = Bij->A, Aoff, Boff, *Aoffs, *Boffs;
1110   IS          Me, Notme;
1111   PetscInt    M, N, first, last, *notme, i;
1112   PetscBool   lf;
1113   PetscMPIInt size;
1114 
1115   PetscFunctionBegin;
1116   /* Easy test: symmetric diagonal block */
1117   PetscCall(MatIsTranspose(Adia, Bdia, tol, &lf));
1118   PetscCallMPI(MPIU_Allreduce(&lf, f, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)Amat)));
1119   if (!*f) PetscFunctionReturn(PETSC_SUCCESS);
1120   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
1121   PetscCallMPI(MPI_Comm_size(comm, &size));
1122   if (size == 1) PetscFunctionReturn(PETSC_SUCCESS);
1123 
1124   /* Hard test: off-diagonal block. This requires a call to MatCreateSubMatrices(). */
1125   PetscCall(MatGetSize(Amat, &M, &N));
1126   PetscCall(MatGetOwnershipRange(Amat, &first, &last));
1127   PetscCall(PetscMalloc1(N - last + first, &notme));
1128   for (i = 0; i < first; i++) notme[i] = i;
1129   for (i = last; i < M; i++) notme[i - last + first] = i;
1130   PetscCall(ISCreateGeneral(MPI_COMM_SELF, N - last + first, notme, PETSC_COPY_VALUES, &Notme));
1131   PetscCall(ISCreateStride(MPI_COMM_SELF, last - first, first, 1, &Me));
1132   PetscCall(MatCreateSubMatrices(Amat, 1, &Me, &Notme, MAT_INITIAL_MATRIX, &Aoffs));
1133   Aoff = Aoffs[0];
1134   PetscCall(MatCreateSubMatrices(Bmat, 1, &Notme, &Me, MAT_INITIAL_MATRIX, &Boffs));
1135   Boff = Boffs[0];
1136   PetscCall(MatIsTranspose(Aoff, Boff, tol, f));
1137   PetscCall(MatDestroyMatrices(1, &Aoffs));
1138   PetscCall(MatDestroyMatrices(1, &Boffs));
1139   PetscCall(ISDestroy(&Me));
1140   PetscCall(ISDestroy(&Notme));
1141   PetscCall(PetscFree(notme));
1142   PetscFunctionReturn(PETSC_SUCCESS);
1143 }
1144 
1145 static PetscErrorCode MatMultTransposeAdd_MPIAIJ(Mat A, Vec xx, Vec yy, Vec zz)
1146 {
1147   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1148 
1149   PetscFunctionBegin;
1150   /* do nondiagonal part */
1151   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
1152   /* do local part */
1153   PetscCall((*a->A->ops->multtransposeadd)(a->A, xx, yy, zz));
1154   /* add partial results together */
1155   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1156   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, zz, ADD_VALUES, SCATTER_REVERSE));
1157   PetscFunctionReturn(PETSC_SUCCESS);
1158 }
1159 
1160 /*
1161   This only works correctly for square matrices where the subblock A->A is the
1162    diagonal block
1163 */
1164 static PetscErrorCode MatGetDiagonal_MPIAIJ(Mat A, Vec v)
1165 {
1166   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1167 
1168   PetscFunctionBegin;
1169   PetscCheck(A->rmap->N == A->cmap->N, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Supports only square matrix where A->A is diag block");
1170   PetscCheck(A->rmap->rstart == A->cmap->rstart && A->rmap->rend == A->cmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "row partition must equal col partition");
1171   PetscCall(MatGetDiagonal(a->A, v));
1172   PetscFunctionReturn(PETSC_SUCCESS);
1173 }
1174 
1175 static PetscErrorCode MatScale_MPIAIJ(Mat A, PetscScalar aa)
1176 {
1177   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1178 
1179   PetscFunctionBegin;
1180   PetscCall(MatScale(a->A, aa));
1181   PetscCall(MatScale(a->B, aa));
1182   PetscFunctionReturn(PETSC_SUCCESS);
1183 }
1184 
1185 static PetscErrorCode MatView_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
1186 {
1187   Mat_MPIAIJ        *aij    = (Mat_MPIAIJ *)mat->data;
1188   Mat_SeqAIJ        *A      = (Mat_SeqAIJ *)aij->A->data;
1189   Mat_SeqAIJ        *B      = (Mat_SeqAIJ *)aij->B->data;
1190   const PetscInt    *garray = aij->garray;
1191   const PetscScalar *aa, *ba;
1192   PetscInt           header[4], M, N, m, rs, cs, cnt, i, ja, jb;
1193   PetscInt64         nz, hnz;
1194   PetscInt          *rowlens;
1195   PetscInt          *colidxs;
1196   PetscScalar       *matvals;
1197   PetscMPIInt        rank;
1198 
1199   PetscFunctionBegin;
1200   PetscCall(PetscViewerSetUp(viewer));
1201 
1202   M  = mat->rmap->N;
1203   N  = mat->cmap->N;
1204   m  = mat->rmap->n;
1205   rs = mat->rmap->rstart;
1206   cs = mat->cmap->rstart;
1207   nz = A->nz + B->nz;
1208 
1209   /* write matrix header */
1210   header[0] = MAT_FILE_CLASSID;
1211   header[1] = M;
1212   header[2] = N;
1213   PetscCallMPI(MPI_Reduce(&nz, &hnz, 1, MPIU_INT64, MPI_SUM, 0, PetscObjectComm((PetscObject)mat)));
1214   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1215   if (rank == 0) PetscCall(PetscIntCast(hnz, &header[3]));
1216   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));
1217 
1218   /* fill in and store row lengths  */
1219   PetscCall(PetscMalloc1(m, &rowlens));
1220   for (i = 0; i < m; i++) rowlens[i] = A->i[i + 1] - A->i[i] + B->i[i + 1] - B->i[i];
1221   PetscCall(PetscViewerBinaryWriteAll(viewer, rowlens, m, rs, M, PETSC_INT));
1222   PetscCall(PetscFree(rowlens));
1223 
1224   /* fill in and store column indices */
1225   PetscCall(PetscMalloc1(nz, &colidxs));
1226   for (cnt = 0, i = 0; i < m; i++) {
1227     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1228       if (garray[B->j[jb]] > cs) break;
1229       colidxs[cnt++] = garray[B->j[jb]];
1230     }
1231     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) colidxs[cnt++] = A->j[ja] + cs;
1232     for (; jb < B->i[i + 1]; jb++) colidxs[cnt++] = garray[B->j[jb]];
1233   }
1234   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1235   PetscCall(PetscViewerBinaryWriteAll(viewer, colidxs, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
1236   PetscCall(PetscFree(colidxs));
1237 
1238   /* fill in and store nonzero values */
1239   PetscCall(MatSeqAIJGetArrayRead(aij->A, &aa));
1240   PetscCall(MatSeqAIJGetArrayRead(aij->B, &ba));
1241   PetscCall(PetscMalloc1(nz, &matvals));
1242   for (cnt = 0, i = 0; i < m; i++) {
1243     for (jb = B->i[i]; jb < B->i[i + 1]; jb++) {
1244       if (garray[B->j[jb]] > cs) break;
1245       matvals[cnt++] = ba[jb];
1246     }
1247     for (ja = A->i[i]; ja < A->i[i + 1]; ja++) matvals[cnt++] = aa[ja];
1248     for (; jb < B->i[i + 1]; jb++) matvals[cnt++] = ba[jb];
1249   }
1250   PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &aa));
1251   PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &ba));
1252   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt64_FMT, cnt, nz);
1253   PetscCall(PetscViewerBinaryWriteAll(viewer, matvals, nz, PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
1254   PetscCall(PetscFree(matvals));
1255 
1256   /* write block size option to the viewer's .info file */
1257   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1258   PetscFunctionReturn(PETSC_SUCCESS);
1259 }
1260 
1261 #include <petscdraw.h>
1262 static PetscErrorCode MatView_MPIAIJ_ASCIIorDraworSocket(Mat mat, PetscViewer viewer)
1263 {
1264   Mat_MPIAIJ       *aij  = (Mat_MPIAIJ *)mat->data;
1265   PetscMPIInt       rank = aij->rank, size = aij->size;
1266   PetscBool         isdraw, iascii, isbinary;
1267   PetscViewer       sviewer;
1268   PetscViewerFormat format;
1269 
1270   PetscFunctionBegin;
1271   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1272   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1273   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1274   if (iascii) {
1275     PetscCall(PetscViewerGetFormat(viewer, &format));
1276     if (format == PETSC_VIEWER_LOAD_BALANCE) {
1277       PetscInt i, nmax = 0, nmin = PETSC_INT_MAX, navg = 0, *nz, nzlocal = ((Mat_SeqAIJ *)aij->A->data)->nz + ((Mat_SeqAIJ *)aij->B->data)->nz;
1278       PetscCall(PetscMalloc1(size, &nz));
1279       PetscCallMPI(MPI_Allgather(&nzlocal, 1, MPIU_INT, nz, 1, MPIU_INT, PetscObjectComm((PetscObject)mat)));
1280       for (i = 0; i < size; i++) {
1281         nmax = PetscMax(nmax, nz[i]);
1282         nmin = PetscMin(nmin, nz[i]);
1283         navg += nz[i];
1284       }
1285       PetscCall(PetscFree(nz));
1286       navg = navg / size;
1287       PetscCall(PetscViewerASCIIPrintf(viewer, "Load Balance - Nonzeros: Min %" PetscInt_FMT "  avg %" PetscInt_FMT "  max %" PetscInt_FMT "\n", nmin, navg, nmax));
1288       PetscFunctionReturn(PETSC_SUCCESS);
1289     }
1290     PetscCall(PetscViewerGetFormat(viewer, &format));
1291     if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1292       MatInfo   info;
1293       PetscInt *inodes = NULL;
1294 
1295       PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)mat), &rank));
1296       PetscCall(MatGetInfo(mat, MAT_LOCAL, &info));
1297       PetscCall(MatInodeGetInodeSizes(aij->A, NULL, &inodes, NULL));
1298       PetscCall(PetscViewerASCIIPushSynchronized(viewer));
1299       if (!inodes) {
1300         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, not using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated,
1301                                                      info.memory));
1302       } else {
1303         PetscCall(
1304           PetscViewerASCIISynchronizedPrintf(viewer, "[%d] Local rows %" PetscInt_FMT " nz %" PetscInt_FMT " nz alloced %" PetscInt_FMT " mem %g, using I-node routines\n", rank, mat->rmap->n, (PetscInt)info.nz_used, (PetscInt)info.nz_allocated, info.memory));
1305       }
1306       PetscCall(MatGetInfo(aij->A, MAT_LOCAL, &info));
1307       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] on-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1308       PetscCall(MatGetInfo(aij->B, MAT_LOCAL, &info));
1309       PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "[%d] off-diagonal part: nz %" PetscInt_FMT " \n", rank, (PetscInt)info.nz_used));
1310       PetscCall(PetscViewerFlush(viewer));
1311       PetscCall(PetscViewerASCIIPopSynchronized(viewer));
1312       PetscCall(PetscViewerASCIIPrintf(viewer, "Information on VecScatter used in matrix-vector product: \n"));
1313       PetscCall(VecScatterView(aij->Mvctx, viewer));
1314       PetscFunctionReturn(PETSC_SUCCESS);
1315     } else if (format == PETSC_VIEWER_ASCII_INFO) {
1316       PetscInt inodecount, inodelimit, *inodes;
1317       PetscCall(MatInodeGetInodeSizes(aij->A, &inodecount, &inodes, &inodelimit));
1318       if (inodes) {
1319         PetscCall(PetscViewerASCIIPrintf(viewer, "using I-node (on process 0) routines: found %" PetscInt_FMT " nodes, limit used is %" PetscInt_FMT "\n", inodecount, inodelimit));
1320       } else {
1321         PetscCall(PetscViewerASCIIPrintf(viewer, "not using I-node (on process 0) routines\n"));
1322       }
1323       PetscFunctionReturn(PETSC_SUCCESS);
1324     } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1325       PetscFunctionReturn(PETSC_SUCCESS);
1326     }
1327   } else if (isbinary) {
1328     if (size == 1) {
1329       PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1330       PetscCall(MatView(aij->A, viewer));
1331     } else {
1332       PetscCall(MatView_MPIAIJ_Binary(mat, viewer));
1333     }
1334     PetscFunctionReturn(PETSC_SUCCESS);
1335   } else if (iascii && size == 1) {
1336     PetscCall(PetscObjectSetName((PetscObject)aij->A, ((PetscObject)mat)->name));
1337     PetscCall(MatView(aij->A, viewer));
1338     PetscFunctionReturn(PETSC_SUCCESS);
1339   } else if (isdraw) {
1340     PetscDraw draw;
1341     PetscBool isnull;
1342     PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1343     PetscCall(PetscDrawIsNull(draw, &isnull));
1344     if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1345   }
1346 
1347   { /* assemble the entire matrix onto first processor */
1348     Mat A = NULL, Av;
1349     IS  isrow, iscol;
1350 
1351     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->rmap->N : 0, 0, 1, &isrow));
1352     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat), rank == 0 ? mat->cmap->N : 0, 0, 1, &iscol));
1353     PetscCall(MatCreateSubMatrix(mat, isrow, iscol, MAT_INITIAL_MATRIX, &A));
1354     PetscCall(MatMPIAIJGetSeqAIJ(A, &Av, NULL, NULL));
1355     /* The commented-out code below uses MatCreateSubMatrices() instead */
1356     /*
1357     Mat *AA, A = NULL, Av;
1358     IS  isrow,iscol;
1359 
1360     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->rmap->N : 0,0,1,&isrow));
1361     PetscCall(ISCreateStride(PetscObjectComm((PetscObject)mat),rank == 0 ? mat->cmap->N : 0,0,1,&iscol));
1362     PetscCall(MatCreateSubMatrices(mat,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&AA));
1363     if (rank == 0) {
1364        PetscCall(PetscObjectReference((PetscObject)AA[0]));
1365        A    = AA[0];
1366        Av   = AA[0];
1367     }
1368     PetscCall(MatDestroySubMatrices(1,&AA));
1369 */
1370     PetscCall(ISDestroy(&iscol));
1371     PetscCall(ISDestroy(&isrow));
1372     /*
1373        Everyone has to make this call since the graphics waits are
1374        synchronized across all processors that share the PetscDraw object
1375     */
1376     PetscCall(PetscViewerGetSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1377     if (rank == 0) {
1378       if (((PetscObject)mat)->name) PetscCall(PetscObjectSetName((PetscObject)Av, ((PetscObject)mat)->name));
1379       PetscCall(MatView_SeqAIJ(Av, sviewer));
1380     }
1381     PetscCall(PetscViewerRestoreSubViewer(viewer, PETSC_COMM_SELF, &sviewer));
1382     PetscCall(MatDestroy(&A));
1383   }
1384   PetscFunctionReturn(PETSC_SUCCESS);
1385 }
1386 
1387 PetscErrorCode MatView_MPIAIJ(Mat mat, PetscViewer viewer)
1388 {
1389   PetscBool iascii, isdraw, issocket, isbinary;
1390 
1391   PetscFunctionBegin;
1392   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1393   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1394   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1395   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERSOCKET, &issocket));
1396   if (iascii || isdraw || isbinary || issocket) PetscCall(MatView_MPIAIJ_ASCIIorDraworSocket(mat, viewer));
1397   PetscFunctionReturn(PETSC_SUCCESS);
1398 }
1399 
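/*
   The local (processor-block) SOR variants below all follow the same pattern,
   sketched here for one iteration: gather the ghost values of xx, fold the
   off-diagonal coupling into the right-hand side, then sweep only with the
   local diagonal block,

     lvec = ghost values of xx          (scatter via Mvctx)
     bb1  = bb - B*lvec                 (computed as B*(-lvec) + bb)
     sweep A with right-hand side bb1   (forward, backward, or symmetric)

   No true parallel SOR is performed; SOR_APPLY_UPPER is simply forwarded to
   the diagonal block.
*/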
1400 static PetscErrorCode MatSOR_MPIAIJ(Mat matin, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1401 {
1402   Mat_MPIAIJ *mat = (Mat_MPIAIJ *)matin->data;
1403   Vec         bb1 = NULL;
1404   PetscBool   hasop;
1405 
1406   PetscFunctionBegin;
1407   if (flag == SOR_APPLY_UPPER) {
1408     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1409     PetscFunctionReturn(PETSC_SUCCESS);
1410   }
1411 
1412   if (its > 1 || ~flag & SOR_ZERO_INITIAL_GUESS || flag & SOR_EISENSTAT) PetscCall(VecDuplicate(bb, &bb1));
1413 
1414   if ((flag & SOR_LOCAL_SYMMETRIC_SWEEP) == SOR_LOCAL_SYMMETRIC_SWEEP) {
1415     if (flag & SOR_ZERO_INITIAL_GUESS) {
1416       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1417       its--;
1418     }
1419 
1420     while (its--) {
1421       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1422       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1423 
1424       /* update rhs: bb1 = bb - B*x */
1425       PetscCall(VecScale(mat->lvec, -1.0));
1426       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1427 
1428       /* local sweep */
1429       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_SYMMETRIC_SWEEP, fshift, lits, 1, xx));
1430     }
1431   } else if (flag & SOR_LOCAL_FORWARD_SWEEP) {
1432     if (flag & SOR_ZERO_INITIAL_GUESS) {
1433       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1434       its--;
1435     }
1436     while (its--) {
1437       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1438       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1439 
1440       /* update rhs: bb1 = bb - B*x */
1441       PetscCall(VecScale(mat->lvec, -1.0));
1442       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1443 
1444       /* local sweep */
1445       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_FORWARD_SWEEP, fshift, lits, 1, xx));
1446     }
1447   } else if (flag & SOR_LOCAL_BACKWARD_SWEEP) {
1448     if (flag & SOR_ZERO_INITIAL_GUESS) {
1449       PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, flag, fshift, lits, 1, xx));
1450       its--;
1451     }
1452     while (its--) {
1453       PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1454       PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1455 
1456       /* update rhs: bb1 = bb - B*x */
1457       PetscCall(VecScale(mat->lvec, -1.0));
1458       PetscCall((*mat->B->ops->multadd)(mat->B, mat->lvec, bb, bb1));
1459 
1460       /* local sweep */
1461       PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, SOR_BACKWARD_SWEEP, fshift, lits, 1, xx));
1462     }
1463   } else if (flag & SOR_EISENSTAT) {
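    /* Eisenstat's trick, a sketch of the steps below: a zero-guess local backward
       sweep produces xx; the off-process coupling B*lvec and the diagonal term
       scaled by (omega-2)/omega are folded into the right-hand side bb1; a final
       zero-guess local forward sweep yields the increment xx1 added to xx, so no
       separate multiplication by A is required. */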
1464     Vec xx1;
1465 
1466     PetscCall(VecDuplicate(bb, &xx1));
1467     PetscCall((*mat->A->ops->sor)(mat->A, bb, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_BACKWARD_SWEEP), fshift, lits, 1, xx));
1468 
1469     PetscCall(VecScatterBegin(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1470     PetscCall(VecScatterEnd(mat->Mvctx, xx, mat->lvec, INSERT_VALUES, SCATTER_FORWARD));
1471     if (!mat->diag) {
1472       PetscCall(MatCreateVecs(matin, &mat->diag, NULL));
1473       PetscCall(MatGetDiagonal(matin, mat->diag));
1474     }
1475     PetscCall(MatHasOperation(matin, MATOP_MULT_DIAGONAL_BLOCK, &hasop));
1476     if (hasop) {
1477       PetscCall(MatMultDiagonalBlock(matin, xx, bb1));
1478     } else {
1479       PetscCall(VecPointwiseMult(bb1, mat->diag, xx));
1480     }
1481     PetscCall(VecAYPX(bb1, (omega - 2.0) / omega, bb));
1482 
1483     PetscCall(MatMultAdd(mat->B, mat->lvec, bb1, bb1));
1484 
1485     /* local sweep */
1486     PetscCall((*mat->A->ops->sor)(mat->A, bb1, omega, (MatSORType)(SOR_ZERO_INITIAL_GUESS | SOR_LOCAL_FORWARD_SWEEP), fshift, lits, 1, xx1));
1487     PetscCall(VecAXPY(xx, 1.0, xx1));
1488     PetscCall(VecDestroy(&xx1));
1489   } else SETERRQ(PetscObjectComm((PetscObject)matin), PETSC_ERR_SUP, "Parallel SOR not supported");
1490 
1491   PetscCall(VecDestroy(&bb1));
1492 
1493   matin->factorerrortype = mat->A->factorerrortype;
1494   PetscFunctionReturn(PETSC_SUCCESS);
1495 }
1496 
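/*
   A rough outline of the permutation algorithm below: the row and column
   permutations are inverted with PetscSF reductions to learn where each locally
   owned row and column lands; the new diagonal/off-diagonal nonzero counts are
   computed from the local CSR structure and broadcast to the destination rows;
   finally the entries are inserted with MatSetValues() under the permuted
   indices and the result is assembled.
*/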
1497 static PetscErrorCode MatPermute_MPIAIJ(Mat A, IS rowp, IS colp, Mat *B)
1498 {
1499   Mat             aA, aB, Aperm;
1500   const PetscInt *rwant, *cwant, *gcols, *ai, *bi, *aj, *bj;
1501   PetscScalar    *aa, *ba;
1502   PetscInt        i, j, m, n, ng, anz, bnz, *dnnz, *onnz, *tdnnz, *tonnz, *rdest, *cdest, *work, *gcdest;
1503   PetscSF         rowsf, sf;
1504   IS              parcolp = NULL;
1505   PetscBool       done;
1506 
1507   PetscFunctionBegin;
1508   PetscCall(MatGetLocalSize(A, &m, &n));
1509   PetscCall(ISGetIndices(rowp, &rwant));
1510   PetscCall(ISGetIndices(colp, &cwant));
1511   PetscCall(PetscMalloc3(PetscMax(m, n), &work, m, &rdest, n, &cdest));
1512 
1513   /* Invert row permutation to find out where my rows should go */
1514   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &rowsf));
1515   PetscCall(PetscSFSetGraphLayout(rowsf, A->rmap, A->rmap->n, NULL, PETSC_OWN_POINTER, rwant));
1516   PetscCall(PetscSFSetFromOptions(rowsf));
1517   for (i = 0; i < m; i++) work[i] = A->rmap->rstart + i;
1518   PetscCall(PetscSFReduceBegin(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1519   PetscCall(PetscSFReduceEnd(rowsf, MPIU_INT, work, rdest, MPI_REPLACE));
1520 
1521   /* Invert column permutation to find out where my columns should go */
1522   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1523   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, A->cmap->n, NULL, PETSC_OWN_POINTER, cwant));
1524   PetscCall(PetscSFSetFromOptions(sf));
1525   for (i = 0; i < n; i++) work[i] = A->cmap->rstart + i;
1526   PetscCall(PetscSFReduceBegin(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1527   PetscCall(PetscSFReduceEnd(sf, MPIU_INT, work, cdest, MPI_REPLACE));
1528   PetscCall(PetscSFDestroy(&sf));
1529 
1530   PetscCall(ISRestoreIndices(rowp, &rwant));
1531   PetscCall(ISRestoreIndices(colp, &cwant));
1532   PetscCall(MatMPIAIJGetSeqAIJ(A, &aA, &aB, &gcols));
1533 
1534   /* Find out where my gcols should go */
1535   PetscCall(MatGetSize(aB, NULL, &ng));
1536   PetscCall(PetscMalloc1(ng, &gcdest));
1537   PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1538   PetscCall(PetscSFSetGraphLayout(sf, A->cmap, ng, NULL, PETSC_OWN_POINTER, gcols));
1539   PetscCall(PetscSFSetFromOptions(sf));
1540   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1541   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, cdest, gcdest, MPI_REPLACE));
1542   PetscCall(PetscSFDestroy(&sf));
1543 
1544   PetscCall(PetscCalloc4(m, &dnnz, m, &onnz, m, &tdnnz, m, &tonnz));
1545   PetscCall(MatGetRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1546   PetscCall(MatGetRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1547   for (i = 0; i < m; i++) {
1548     PetscInt    row = rdest[i];
1549     PetscMPIInt rowner;
1550     PetscCall(PetscLayoutFindOwner(A->rmap, row, &rowner));
1551     for (j = ai[i]; j < ai[i + 1]; j++) {
1552       PetscInt    col = cdest[aj[j]];
1553       PetscMPIInt cowner;
1554       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner)); /* Could build an index for the columns to eliminate this search */
1555       if (rowner == cowner) dnnz[i]++;
1556       else onnz[i]++;
1557     }
1558     for (j = bi[i]; j < bi[i + 1]; j++) {
1559       PetscInt    col = gcdest[bj[j]];
1560       PetscMPIInt cowner;
1561       PetscCall(PetscLayoutFindOwner(A->cmap, col, &cowner));
1562       if (rowner == cowner) dnnz[i]++;
1563       else onnz[i]++;
1564     }
1565   }
1566   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1567   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, dnnz, tdnnz, MPI_REPLACE));
1568   PetscCall(PetscSFBcastBegin(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1569   PetscCall(PetscSFBcastEnd(rowsf, MPIU_INT, onnz, tonnz, MPI_REPLACE));
1570   PetscCall(PetscSFDestroy(&rowsf));
1571 
1572   PetscCall(MatCreateAIJ(PetscObjectComm((PetscObject)A), A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N, 0, tdnnz, 0, tonnz, &Aperm));
1573   PetscCall(MatSeqAIJGetArray(aA, &aa));
1574   PetscCall(MatSeqAIJGetArray(aB, &ba));
1575   for (i = 0; i < m; i++) {
1576     PetscInt *acols = dnnz, *bcols = onnz; /* Repurpose now-unneeded arrays */
1577     PetscInt  j0, rowlen;
1578     rowlen = ai[i + 1] - ai[i];
1579     for (j0 = j = 0; j < rowlen; j0 = j) { /* rowlen could be larger than the number of rows m, so insert in batches of at most m */
1580       for (; j < PetscMin(rowlen, j0 + m); j++) acols[j - j0] = cdest[aj[ai[i] + j]];
1581       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, acols, aa + ai[i] + j0, INSERT_VALUES));
1582     }
1583     rowlen = bi[i + 1] - bi[i];
1584     for (j0 = j = 0; j < rowlen; j0 = j) {
1585       for (; j < PetscMin(rowlen, j0 + m); j++) bcols[j - j0] = gcdest[bj[bi[i] + j]];
1586       PetscCall(MatSetValues(Aperm, 1, &rdest[i], j - j0, bcols, ba + bi[i] + j0, INSERT_VALUES));
1587     }
1588   }
1589   PetscCall(MatAssemblyBegin(Aperm, MAT_FINAL_ASSEMBLY));
1590   PetscCall(MatAssemblyEnd(Aperm, MAT_FINAL_ASSEMBLY));
1591   PetscCall(MatRestoreRowIJ(aA, 0, PETSC_FALSE, PETSC_FALSE, &anz, &ai, &aj, &done));
1592   PetscCall(MatRestoreRowIJ(aB, 0, PETSC_FALSE, PETSC_FALSE, &bnz, &bi, &bj, &done));
1593   PetscCall(MatSeqAIJRestoreArray(aA, &aa));
1594   PetscCall(MatSeqAIJRestoreArray(aB, &ba));
1595   PetscCall(PetscFree4(dnnz, onnz, tdnnz, tonnz));
1596   PetscCall(PetscFree3(work, rdest, cdest));
1597   PetscCall(PetscFree(gcdest));
1598   if (parcolp) PetscCall(ISDestroy(&colp));
1599   *B = Aperm;
1600   PetscFunctionReturn(PETSC_SUCCESS);
1601 }
1602 
1603 static PetscErrorCode MatGetGhosts_MPIAIJ(Mat mat, PetscInt *nghosts, const PetscInt *ghosts[])
1604 {
1605   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1606 
1607   PetscFunctionBegin;
1608   PetscCall(MatGetSize(aij->B, NULL, nghosts));
1609   if (ghosts) *ghosts = aij->garray;
1610   PetscFunctionReturn(PETSC_SUCCESS);
1611 }
1612 
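/*
   MatGetInfo sums the statistics of the two local blocks; for MAT_GLOBAL_MAX and
   MAT_GLOBAL_SUM the five counters (nz_used, nz_allocated, nz_unneeded, memory,
   mallocs) are reduced across the communicator in a single Allreduce on a
   PetscLogDouble array.
*/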
1613 static PetscErrorCode MatGetInfo_MPIAIJ(Mat matin, MatInfoType flag, MatInfo *info)
1614 {
1615   Mat_MPIAIJ    *mat = (Mat_MPIAIJ *)matin->data;
1616   Mat            A = mat->A, B = mat->B;
1617   PetscLogDouble isend[5], irecv[5];
1618 
1619   PetscFunctionBegin;
1620   info->block_size = 1.0;
1621   PetscCall(MatGetInfo(A, MAT_LOCAL, info));
1622 
1623   isend[0] = info->nz_used;
1624   isend[1] = info->nz_allocated;
1625   isend[2] = info->nz_unneeded;
1626   isend[3] = info->memory;
1627   isend[4] = info->mallocs;
1628 
1629   PetscCall(MatGetInfo(B, MAT_LOCAL, info));
1630 
1631   isend[0] += info->nz_used;
1632   isend[1] += info->nz_allocated;
1633   isend[2] += info->nz_unneeded;
1634   isend[3] += info->memory;
1635   isend[4] += info->mallocs;
1636   if (flag == MAT_LOCAL) {
1637     info->nz_used      = isend[0];
1638     info->nz_allocated = isend[1];
1639     info->nz_unneeded  = isend[2];
1640     info->memory       = isend[3];
1641     info->mallocs      = isend[4];
1642   } else if (flag == MAT_GLOBAL_MAX) {
1643     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_MAX, PetscObjectComm((PetscObject)matin)));
1644 
1645     info->nz_used      = irecv[0];
1646     info->nz_allocated = irecv[1];
1647     info->nz_unneeded  = irecv[2];
1648     info->memory       = irecv[3];
1649     info->mallocs      = irecv[4];
1650   } else if (flag == MAT_GLOBAL_SUM) {
1651     PetscCallMPI(MPIU_Allreduce(isend, irecv, 5, MPIU_PETSCLOGDOUBLE, MPI_SUM, PetscObjectComm((PetscObject)matin)));
1652 
1653     info->nz_used      = irecv[0];
1654     info->nz_allocated = irecv[1];
1655     info->nz_unneeded  = irecv[2];
1656     info->memory       = irecv[3];
1657     info->mallocs      = irecv[4];
1658   }
1659   info->fill_ratio_given  = 0; /* no parallel LU/ILU/Cholesky */
1660   info->fill_ratio_needed = 0;
1661   info->factor_mallocs    = 0;
1662   PetscFunctionReturn(PETSC_SUCCESS);
1663 }
1664 
1665 PetscErrorCode MatSetOption_MPIAIJ(Mat A, MatOption op, PetscBool flg)
1666 {
1667   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
1668 
1669   PetscFunctionBegin;
1670   switch (op) {
1671   case MAT_NEW_NONZERO_LOCATIONS:
1672   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1673   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1674   case MAT_KEEP_NONZERO_PATTERN:
1675   case MAT_NEW_NONZERO_LOCATION_ERR:
1676   case MAT_USE_INODES:
1677   case MAT_IGNORE_ZERO_ENTRIES:
1678   case MAT_FORM_EXPLICIT_TRANSPOSE:
1679     MatCheckPreallocated(A, 1);
1680     PetscCall(MatSetOption(a->A, op, flg));
1681     PetscCall(MatSetOption(a->B, op, flg));
1682     break;
1683   case MAT_ROW_ORIENTED:
1684     MatCheckPreallocated(A, 1);
1685     a->roworiented = flg;
1686 
1687     PetscCall(MatSetOption(a->A, op, flg));
1688     PetscCall(MatSetOption(a->B, op, flg));
1689     break;
1690   case MAT_IGNORE_OFF_PROC_ENTRIES:
1691     a->donotstash = flg;
1692     break;
1693   /* Symmetry flags are handled directly by MatSetOption() and they don't affect preallocation */
1694   case MAT_SPD:
1695   case MAT_SYMMETRIC:
1696   case MAT_STRUCTURALLY_SYMMETRIC:
1697   case MAT_HERMITIAN:
1698   case MAT_SYMMETRY_ETERNAL:
1699   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
1700   case MAT_SPD_ETERNAL:
1701     /* if the diagonal block is square it inherits some of the properties above */
1702     if (a->A && A->rmap->n == A->cmap->n) PetscCall(MatSetOption(a->A, op, flg));
1703     break;
1704   case MAT_SUBMAT_SINGLEIS:
1705     A->submat_singleis = flg;
1706     break;
1707   default:
1708     break;
1709   }
1710   PetscFunctionReturn(PETSC_SUCCESS);
1711 }
1712 
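/*
   MatGetRow for MPIAIJ merges one row of the diagonal block A and one row of the
   off-diagonal block B into a single row with globally sorted column indices:
   B entries whose global column (through garray) precedes the diagonal range come
   first, then the A entries shifted by cstart, then the remaining B entries.
*/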
1713 PetscErrorCode MatGetRow_MPIAIJ(Mat matin, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1714 {
1715   Mat_MPIAIJ  *mat = (Mat_MPIAIJ *)matin->data;
1716   PetscScalar *vworkA, *vworkB, **pvA, **pvB, *v_p;
1717   PetscInt     i, *cworkA, *cworkB, **pcA, **pcB, cstart = matin->cmap->rstart;
1718   PetscInt     nztot, nzA, nzB, lrow, rstart = matin->rmap->rstart, rend = matin->rmap->rend;
1719   PetscInt    *cmap, *idx_p;
1720 
1721   PetscFunctionBegin;
1722   PetscCheck(!mat->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Already active");
1723   mat->getrowactive = PETSC_TRUE;
1724 
1725   if (!mat->rowvalues && (idx || v)) {
1726     /*
1727         allocate enough space to hold information from the longest row.
1728     */
1729     Mat_SeqAIJ *Aa = (Mat_SeqAIJ *)mat->A->data, *Ba = (Mat_SeqAIJ *)mat->B->data;
1730     PetscInt    max = 1, tmp;
1731     for (i = 0; i < matin->rmap->n; i++) {
1732       tmp = Aa->i[i + 1] - Aa->i[i] + Ba->i[i + 1] - Ba->i[i];
1733       if (max < tmp) max = tmp;
1734     }
1735     PetscCall(PetscMalloc2(max, &mat->rowvalues, max, &mat->rowindices));
1736   }
1737 
1738   PetscCheck(row >= rstart && row < rend, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Only local rows");
1739   lrow = row - rstart;
1740 
1741   pvA = &vworkA;
1742   pcA = &cworkA;
1743   pvB = &vworkB;
1744   pcB = &cworkB;
1745   if (!v) {
1746     pvA = NULL;
1747     pvB = NULL;
1748   }
1749   if (!idx) {
1750     pcA = NULL;
1751     if (!v) pcB = NULL;
1752   }
1753   PetscCall((*mat->A->ops->getrow)(mat->A, lrow, &nzA, pcA, pvA));
1754   PetscCall((*mat->B->ops->getrow)(mat->B, lrow, &nzB, pcB, pvB));
1755   nztot = nzA + nzB;
1756 
1757   cmap = mat->garray;
1758   if (v || idx) {
1759     if (nztot) {
1760       /* Sort by increasing column numbers, assuming A and B already sorted */
1761       PetscInt imark = -1;
1762       if (v) {
1763         *v = v_p = mat->rowvalues;
1764         for (i = 0; i < nzB; i++) {
1765           if (cmap[cworkB[i]] < cstart) v_p[i] = vworkB[i];
1766           else break;
1767         }
1768         imark = i;
1769         for (i = 0; i < nzA; i++) v_p[imark + i] = vworkA[i];
1770         for (i = imark; i < nzB; i++) v_p[nzA + i] = vworkB[i];
1771       }
1772       if (idx) {
1773         *idx = idx_p = mat->rowindices;
1774         if (imark > -1) {
1775           for (i = 0; i < imark; i++) idx_p[i] = cmap[cworkB[i]];
1776         } else {
1777           for (i = 0; i < nzB; i++) {
1778             if (cmap[cworkB[i]] < cstart) idx_p[i] = cmap[cworkB[i]];
1779             else break;
1780           }
1781           imark = i;
1782         }
1783         for (i = 0; i < nzA; i++) idx_p[imark + i] = cstart + cworkA[i];
1784         for (i = imark; i < nzB; i++) idx_p[nzA + i] = cmap[cworkB[i]];
1785       }
1786     } else {
1787       if (idx) *idx = NULL;
1788       if (v) *v = NULL;
1789     }
1790   }
1791   *nz = nztot;
1792   PetscCall((*mat->A->ops->restorerow)(mat->A, lrow, &nzA, pcA, pvA));
1793   PetscCall((*mat->B->ops->restorerow)(mat->B, lrow, &nzB, pcB, pvB));
1794   PetscFunctionReturn(PETSC_SUCCESS);
1795 }
1796 
1797 PetscErrorCode MatRestoreRow_MPIAIJ(Mat mat, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1798 {
1799   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1800 
1801   PetscFunctionBegin;
1802   PetscCheck(aij->getrowactive, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "MatGetRow() must be called first");
1803   aij->getrowactive = PETSC_FALSE;
1804   PetscFunctionReturn(PETSC_SUCCESS);
1805 }
1806 
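/*
   The norms below combine the two local blocks and reduce across ranks. In the
   usual notation,

     NORM_FROBENIUS: ||A||_F   = sqrt( sum_{i,j} |a_{ij}|^2 )  (Allreduce of local sums)
     NORM_1:         ||A||_1   = max_j sum_i |a_{ij}|          (column sums; note the
                                 temporary of global length N, which is not scalable)
     NORM_INFINITY:  ||A||_inf = max_i sum_j |a_{ij}|          (row sums are rank-local)
*/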
1807 static PetscErrorCode MatNorm_MPIAIJ(Mat mat, NormType type, PetscReal *norm)
1808 {
1809   Mat_MPIAIJ      *aij  = (Mat_MPIAIJ *)mat->data;
1810   Mat_SeqAIJ      *amat = (Mat_SeqAIJ *)aij->A->data, *bmat = (Mat_SeqAIJ *)aij->B->data;
1811   PetscInt         i, j, cstart = mat->cmap->rstart;
1812   PetscReal        sum = 0.0;
1813   const MatScalar *v, *amata, *bmata;
1814 
1815   PetscFunctionBegin;
1816   if (aij->size == 1) {
1817     PetscCall(MatNorm(aij->A, type, norm));
1818   } else {
1819     PetscCall(MatSeqAIJGetArrayRead(aij->A, &amata));
1820     PetscCall(MatSeqAIJGetArrayRead(aij->B, &bmata));
1821     if (type == NORM_FROBENIUS) {
1822       v = amata;
1823       for (i = 0; i < amat->nz; i++) {
1824         sum += PetscRealPart(PetscConj(*v) * (*v));
1825         v++;
1826       }
1827       v = bmata;
1828       for (i = 0; i < bmat->nz; i++) {
1829         sum += PetscRealPart(PetscConj(*v) * (*v));
1830         v++;
1831       }
1832       PetscCallMPI(MPIU_Allreduce(&sum, norm, 1, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1833       *norm = PetscSqrtReal(*norm);
1834       PetscCall(PetscLogFlops(2.0 * amat->nz + 2.0 * bmat->nz));
1835     } else if (type == NORM_1) { /* max column norm */
1836       PetscReal *tmp;
1837       PetscInt  *jj, *garray = aij->garray;
1838       PetscCall(PetscCalloc1(mat->cmap->N + 1, &tmp));
1839       *norm = 0.0;
1840       v     = amata;
1841       jj    = amat->j;
1842       for (j = 0; j < amat->nz; j++) {
1843         tmp[cstart + *jj++] += PetscAbsScalar(*v);
1844         v++;
1845       }
1846       v  = bmata;
1847       jj = bmat->j;
1848       for (j = 0; j < bmat->nz; j++) {
1849         tmp[garray[*jj++]] += PetscAbsScalar(*v);
1850         v++;
1851       }
1852       PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, tmp, mat->cmap->N, MPIU_REAL, MPIU_SUM, PetscObjectComm((PetscObject)mat)));
1853       for (j = 0; j < mat->cmap->N; j++) {
1854         if (tmp[j] > *norm) *norm = tmp[j];
1855       }
1856       PetscCall(PetscFree(tmp));
1857       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1858     } else if (type == NORM_INFINITY) { /* max row norm */
1859       PetscReal ntemp = 0.0;
1860       for (j = 0; j < aij->A->rmap->n; j++) {
1861         v   = PetscSafePointerPlusOffset(amata, amat->i[j]);
1862         sum = 0.0;
1863         for (i = 0; i < amat->i[j + 1] - amat->i[j]; i++) {
1864           sum += PetscAbsScalar(*v);
1865           v++;
1866         }
1867         v = PetscSafePointerPlusOffset(bmata, bmat->i[j]);
1868         for (i = 0; i < bmat->i[j + 1] - bmat->i[j]; i++) {
1869           sum += PetscAbsScalar(*v);
1870           v++;
1871         }
1872         if (sum > ntemp) ntemp = sum;
1873       }
1874       PetscCallMPI(MPIU_Allreduce(&ntemp, norm, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)mat)));
1875       PetscCall(PetscLogFlops(PetscMax(amat->nz + bmat->nz - 1, 0)));
1876     } else SETERRQ(PetscObjectComm((PetscObject)mat), PETSC_ERR_SUP, "No support for the two norm");
1877     PetscCall(MatSeqAIJRestoreArrayRead(aij->A, &amata));
1878     PetscCall(MatSeqAIJRestoreArrayRead(aij->B, &bmata));
1879   }
1880   PetscFunctionReturn(PETSC_SUCCESS);
1881 }
1882 
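/*
   A sketch of the transpose algorithm below: preallocation for B = A^T is obtained
   by counting column occurrences of the diagonal block (d_nnz) and reducing the
   off-diagonal column counts onto their owning ranks with a PetscSF (o_nnz); the
   diagonal block is then transposed purely locally, while the off-diagonal entries
   are communicated through MatSetValues() with global indices.
*/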
1883 static PetscErrorCode MatTranspose_MPIAIJ(Mat A, MatReuse reuse, Mat *matout)
1884 {
1885   Mat_MPIAIJ      *a    = (Mat_MPIAIJ *)A->data, *b;
1886   Mat_SeqAIJ      *Aloc = (Mat_SeqAIJ *)a->A->data, *Bloc = (Mat_SeqAIJ *)a->B->data, *sub_B_diag;
1887   PetscInt         M = A->rmap->N, N = A->cmap->N, ma, na, mb, nb, row, *cols, *cols_tmp, *B_diag_ilen, i, ncol, A_diag_ncol;
1888   const PetscInt  *ai, *aj, *bi, *bj, *B_diag_i;
1889   Mat              B, A_diag, *B_diag;
1890   const MatScalar *pbv, *bv;
1891 
1892   PetscFunctionBegin;
1893   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *matout));
1894   ma = A->rmap->n;
1895   na = A->cmap->n;
1896   mb = a->B->rmap->n;
1897   nb = a->B->cmap->n;
1898   ai = Aloc->i;
1899   aj = Aloc->j;
1900   bi = Bloc->i;
1901   bj = Bloc->j;
1902   if (reuse == MAT_INITIAL_MATRIX || *matout == A) {
1903     PetscInt            *d_nnz, *g_nnz, *o_nnz;
1904     PetscSFNode         *oloc;
1905     PETSC_UNUSED PetscSF sf;
1906 
1907     PetscCall(PetscMalloc4(na, &d_nnz, na, &o_nnz, nb, &g_nnz, nb, &oloc));
1908     /* compute d_nnz for preallocation */
1909     PetscCall(PetscArrayzero(d_nnz, na));
1910     for (i = 0; i < ai[ma]; i++) d_nnz[aj[i]]++;
1911     /* compute local off-diagonal contributions */
1912     PetscCall(PetscArrayzero(g_nnz, nb));
1913     for (i = 0; i < bi[ma]; i++) g_nnz[bj[i]]++;
1914     /* map those to global */
1915     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)A), &sf));
1916     PetscCall(PetscSFSetGraphLayout(sf, A->cmap, nb, NULL, PETSC_USE_POINTER, a->garray));
1917     PetscCall(PetscSFSetFromOptions(sf));
1918     PetscCall(PetscArrayzero(o_nnz, na));
1919     PetscCall(PetscSFReduceBegin(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1920     PetscCall(PetscSFReduceEnd(sf, MPIU_INT, g_nnz, o_nnz, MPI_SUM));
1921     PetscCall(PetscSFDestroy(&sf));
1922 
1923     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
1924     PetscCall(MatSetSizes(B, A->cmap->n, A->rmap->n, N, M));
1925     PetscCall(MatSetBlockSizes(B, A->cmap->bs, A->rmap->bs));
1926     PetscCall(MatSetType(B, ((PetscObject)A)->type_name));
1927     PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
1928     PetscCall(PetscFree4(d_nnz, o_nnz, g_nnz, oloc));
1929   } else {
1930     B = *matout;
1931     PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
1932   }
1933 
1934   b           = (Mat_MPIAIJ *)B->data;
1935   A_diag      = a->A;
1936   B_diag      = &b->A;
1937   sub_B_diag  = (Mat_SeqAIJ *)(*B_diag)->data;
1938   A_diag_ncol = A_diag->cmap->N;
1939   B_diag_ilen = sub_B_diag->ilen;
1940   B_diag_i    = sub_B_diag->i;
1941 
1942   /* Set ilen for diagonal of B */
1943   for (i = 0; i < A_diag_ncol; i++) B_diag_ilen[i] = B_diag_i[i + 1] - B_diag_i[i];
1944 
1945   /* Transpose the diagonal part of the matrix. In contrast to the off-diagonal part, this can be done
1946   very quickly (i.e., without using MatSetValues()), because all writes are local. */
1947   PetscCall(MatTransposeSetPrecursor(A_diag, *B_diag));
1948   PetscCall(MatTranspose(A_diag, MAT_REUSE_MATRIX, B_diag));
1949 
1950   /* copy over the B part */
1951   PetscCall(PetscMalloc1(bi[mb], &cols));
1952   PetscCall(MatSeqAIJGetArrayRead(a->B, &bv));
1953   pbv = bv;
1954   row = A->rmap->rstart;
1955   for (i = 0; i < bi[mb]; i++) cols[i] = a->garray[bj[i]];
1956   cols_tmp = cols;
1957   for (i = 0; i < mb; i++) {
1958     ncol = bi[i + 1] - bi[i];
1959     PetscCall(MatSetValues(B, ncol, cols_tmp, 1, &row, pbv, INSERT_VALUES));
1960     row++;
1961     if (pbv) pbv += ncol;
1962     if (cols_tmp) cols_tmp += ncol;
1963   }
1964   PetscCall(PetscFree(cols));
1965   PetscCall(MatSeqAIJRestoreArrayRead(a->B, &bv));
1966 
1967   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1968   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1969   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1970     *matout = B;
1971   } else {
1972     PetscCall(MatHeaderMerge(A, &B));
1973   }
1974   PetscFunctionReturn(PETSC_SUCCESS);
1975 }
1976 
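/*
   Diagonal scaling computes diag(ll)*A*diag(rr). Right-scaling the off-diagonal
   block needs the ghost entries of rr, so the scatter is started first, the two
   blocks are left-scaled (and the diagonal block right-scaled) while it is in
   flight, and the off-diagonal block is right-scaled once the scatter completes.
*/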
1977 static PetscErrorCode MatDiagonalScale_MPIAIJ(Mat mat, Vec ll, Vec rr)
1978 {
1979   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
1980   Mat         a = aij->A, b = aij->B;
1981   PetscInt    s1, s2, s3;
1982 
1983   PetscFunctionBegin;
1984   PetscCall(MatGetLocalSize(mat, &s2, &s3));
1985   if (rr) {
1986     PetscCall(VecGetLocalSize(rr, &s1));
1987     PetscCheck(s1 == s3, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "right vector non-conforming local size");
1988     /* Overlap communication with computation. */
1989     PetscCall(VecScatterBegin(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
1990   }
1991   if (ll) {
1992     PetscCall(VecGetLocalSize(ll, &s1));
1993     PetscCheck(s1 == s2, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "left vector non-conforming local size");
1994     PetscUseTypeMethod(b, diagonalscale, ll, NULL);
1995   }
1996   /* scale the diagonal block */
1997   PetscUseTypeMethod(a, diagonalscale, ll, rr);
1998 
1999   if (rr) {
2000     /* Do a scatter end and then right scale the off-diagonal block */
2001     PetscCall(VecScatterEnd(aij->Mvctx, rr, aij->lvec, INSERT_VALUES, SCATTER_FORWARD));
2002     PetscUseTypeMethod(b, diagonalscale, NULL, aij->lvec);
2003   }
2004   PetscFunctionReturn(PETSC_SUCCESS);
2005 }
2006 
2007 static PetscErrorCode MatSetUnfactored_MPIAIJ(Mat A)
2008 {
2009   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2010 
2011   PetscFunctionBegin;
2012   PetscCall(MatSetUnfactored(a->A));
2013   PetscFunctionReturn(PETSC_SUCCESS);
2014 }
2015 
2016 static PetscErrorCode MatEqual_MPIAIJ(Mat A, Mat B, PetscBool *flag)
2017 {
2018   Mat_MPIAIJ *matB = (Mat_MPIAIJ *)B->data, *matA = (Mat_MPIAIJ *)A->data;
2019   Mat         a, b, c, d;
2020   PetscBool   flg;
2021 
2022   PetscFunctionBegin;
2023   a = matA->A;
2024   b = matA->B;
2025   c = matB->A;
2026   d = matB->B;
2027 
2028   PetscCall(MatEqual(a, c, &flg));
2029   if (flg) PetscCall(MatEqual(b, d, &flg));
2030   PetscCallMPI(MPIU_Allreduce(&flg, flag, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)A)));
2031   PetscFunctionReturn(PETSC_SUCCESS);
2032 }
2033 
2034 static PetscErrorCode MatCopy_MPIAIJ(Mat A, Mat B, MatStructure str)
2035 {
2036   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2037   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2038 
2039   PetscFunctionBegin;
2040   /* If the two matrices don't have the same copy implementation, they aren't compatible for fast copy. */
2041   if ((str != SAME_NONZERO_PATTERN) || (A->ops->copy != B->ops->copy)) {
2042     /* because of the column compression in the off-processor part of the matrix a->B,
2043        the number of columns in a->B and b->B may be different, hence we cannot call
2044        MatCopy() directly on the two parts. If need be, a more efficient copy than
2045        MatCopy_Basic() could be provided by first uncompressing the a->B matrices and
2046        then copying the submatrices */
2047     PetscCall(MatCopy_Basic(A, B, str));
2048   } else {
2049     PetscCall(MatCopy(a->A, b->A, str));
2050     PetscCall(MatCopy(a->B, b->B, str));
2051   }
2052   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2053   PetscFunctionReturn(PETSC_SUCCESS);
2054 }
2055 
2056 /*
2057    Computes the number of nonzeros per row needed for preallocation when X and Y
2058    have different nonzero structure.
2059 */
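/*
   The count below is a sorted merge of the two rows' global column indices, i.e.
   nnz[i] is the size of the union of the column sets of row i of X and row i of Y.
   For example, if in row i X has global columns {1,3} and Y has {2,3,5}, the union
   {1,2,3,5} gives nnz[i] = 4.
*/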
2060 PetscErrorCode MatAXPYGetPreallocation_MPIX_private(PetscInt m, const PetscInt *xi, const PetscInt *xj, const PetscInt *xltog, const PetscInt *yi, const PetscInt *yj, const PetscInt *yltog, PetscInt *nnz)
2061 {
2062   PetscInt i, j, k, nzx, nzy;
2063 
2064   PetscFunctionBegin;
2065   /* Set the number of nonzeros in the new matrix */
2066   for (i = 0; i < m; i++) {
2067     const PetscInt *xjj = PetscSafePointerPlusOffset(xj, xi[i]), *yjj = PetscSafePointerPlusOffset(yj, yi[i]);
2068     nzx    = xi[i + 1] - xi[i];
2069     nzy    = yi[i + 1] - yi[i];
2070     nnz[i] = 0;
2071     for (j = 0, k = 0; j < nzx; j++) {                                /* Point in X */
2072       for (; k < nzy && yltog[yjj[k]] < xltog[xjj[j]]; k++) nnz[i]++; /* Catch up to X */
2073       if (k < nzy && yltog[yjj[k]] == xltog[xjj[j]]) k++;             /* Skip duplicate */
2074       nnz[i]++;
2075     }
2076     for (; k < nzy; k++) nnz[i]++;
2077   }
2078   PetscFunctionReturn(PETSC_SUCCESS);
2079 }
2080 
2081 /* This is the same as MatAXPYGetPreallocation_SeqAIJ, except that the local-to-global map is provided */
2082 static PetscErrorCode MatAXPYGetPreallocation_MPIAIJ(Mat Y, const PetscInt *yltog, Mat X, const PetscInt *xltog, PetscInt *nnz)
2083 {
2084   PetscInt    m = Y->rmap->N;
2085   Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data;
2086   Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data;
2087 
2088   PetscFunctionBegin;
2089   PetscCall(MatAXPYGetPreallocation_MPIX_private(m, x->i, x->j, xltog, y->i, y->j, yltog, nnz));
2090   PetscFunctionReturn(PETSC_SUCCESS);
2091 }
2092 
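/*
   MatAXPY distinguishes three cases: with SAME_NONZERO_PATTERN the blockwise
   Y += a*X is done directly; with SUBSET_NONZERO_PATTERN the generic
   MatAXPY_Basic() suffices; otherwise a new matrix with the merged nonzero
   pattern is preallocated (using the union counts computed above), filled, and
   swapped into Y via MatHeaderMerge().
*/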
2093 static PetscErrorCode MatAXPY_MPIAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2094 {
2095   Mat_MPIAIJ *xx = (Mat_MPIAIJ *)X->data, *yy = (Mat_MPIAIJ *)Y->data;
2096 
2097   PetscFunctionBegin;
2098   if (str == SAME_NONZERO_PATTERN) {
2099     PetscCall(MatAXPY(yy->A, a, xx->A, str));
2100     PetscCall(MatAXPY(yy->B, a, xx->B, str));
2101   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2102     PetscCall(MatAXPY_Basic(Y, a, X, str));
2103   } else {
2104     Mat       B;
2105     PetscInt *nnz_d, *nnz_o;
2106 
2107     PetscCall(PetscMalloc1(yy->A->rmap->N, &nnz_d));
2108     PetscCall(PetscMalloc1(yy->B->rmap->N, &nnz_o));
2109     PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
2110     PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
2111     PetscCall(MatSetLayouts(B, Y->rmap, Y->cmap));
2112     PetscCall(MatSetType(B, ((PetscObject)Y)->type_name));
2113     PetscCall(MatAXPYGetPreallocation_SeqAIJ(yy->A, xx->A, nnz_d));
2114     PetscCall(MatAXPYGetPreallocation_MPIAIJ(yy->B, yy->garray, xx->B, xx->garray, nnz_o));
2115     PetscCall(MatMPIAIJSetPreallocation(B, 0, nnz_d, 0, nnz_o));
2116     PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
2117     PetscCall(MatHeaderMerge(Y, &B));
2118     PetscCall(PetscFree(nnz_d));
2119     PetscCall(PetscFree(nnz_o));
2120   }
2121   PetscFunctionReturn(PETSC_SUCCESS);
2122 }
2123 
2124 PETSC_INTERN PetscErrorCode MatConjugate_SeqAIJ(Mat);
2125 
2126 static PetscErrorCode MatConjugate_MPIAIJ(Mat mat)
2127 {
2128   PetscFunctionBegin;
2129   if (PetscDefined(USE_COMPLEX)) {
2130     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2131 
2132     PetscCall(MatConjugate_SeqAIJ(aij->A));
2133     PetscCall(MatConjugate_SeqAIJ(aij->B));
2134   }
2135   PetscFunctionReturn(PETSC_SUCCESS);
2136 }
2137 
2138 static PetscErrorCode MatRealPart_MPIAIJ(Mat A)
2139 {
2140   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2141 
2142   PetscFunctionBegin;
2143   PetscCall(MatRealPart(a->A));
2144   PetscCall(MatRealPart(a->B));
2145   PetscFunctionReturn(PETSC_SUCCESS);
2146 }
2147 
2148 static PetscErrorCode MatImaginaryPart_MPIAIJ(Mat A)
2149 {
2150   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2151 
2152   PetscFunctionBegin;
2153   PetscCall(MatImaginaryPart(a->A));
2154   PetscCall(MatImaginaryPart(a->B));
2155   PetscFunctionReturn(PETSC_SUCCESS);
2156 }
2157 
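/*
   The row max/min routines below share one pattern: compute the per-row extremum
   of the diagonal block and of the off-diagonal block separately, translate the
   compressed B column indices to global ones through garray, and merge the two
   results; on ties the smaller global column index is reported.
*/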
2158 static PetscErrorCode MatGetRowMaxAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2159 {
2160   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
2161   PetscInt           i, *idxb = NULL, m = A->rmap->n;
2162   PetscScalar       *vv;
2163   Vec                vB, vA;
2164   const PetscScalar *va, *vb;
2165 
2166   PetscFunctionBegin;
2167   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2168   PetscCall(MatGetRowMaxAbs(a->A, vA, idx));
2169 
2170   PetscCall(VecGetArrayRead(vA, &va));
2171   if (idx) {
2172     for (i = 0; i < m; i++) {
2173       if (PetscAbsScalar(va[i])) idx[i] += A->cmap->rstart;
2174     }
2175   }
2176 
2177   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2178   PetscCall(PetscMalloc1(m, &idxb));
2179   PetscCall(MatGetRowMaxAbs(a->B, vB, idxb));
2180 
2181   PetscCall(VecGetArrayWrite(v, &vv));
2182   PetscCall(VecGetArrayRead(vB, &vb));
2183   for (i = 0; i < m; i++) {
2184     if (PetscAbsScalar(va[i]) < PetscAbsScalar(vb[i])) {
2185       vv[i] = vb[i];
2186       if (idx) idx[i] = a->garray[idxb[i]];
2187     } else {
2188       vv[i] = va[i];
2189       if (idx && PetscAbsScalar(va[i]) == PetscAbsScalar(vb[i]) && idxb[i] != -1 && idx[i] > a->garray[idxb[i]]) idx[i] = a->garray[idxb[i]];
2190     }
2191   }
2192   PetscCall(VecRestoreArrayWrite(v, &vv));
2193   PetscCall(VecRestoreArrayRead(vA, &va));
2194   PetscCall(VecRestoreArrayRead(vB, &vb));
2195   PetscCall(PetscFree(idxb));
2196   PetscCall(VecDestroy(&vA));
2197   PetscCall(VecDestroy(&vB));
2198   PetscFunctionReturn(PETSC_SUCCESS);
2199 }
2200 
2201 static PetscErrorCode MatGetRowSumAbs_MPIAIJ(Mat A, Vec v)
2202 {
2203   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2204   Vec         vB, vA;
2205 
2206   PetscFunctionBegin;
2207   PetscCall(MatCreateVecs(a->A, NULL, &vA));
2208   PetscCall(MatGetRowSumAbs(a->A, vA));
2209   PetscCall(MatCreateVecs(a->B, NULL, &vB));
2210   PetscCall(MatGetRowSumAbs(a->B, vB));
2211   PetscCall(VecAXPY(vA, 1.0, vB));
2212   PetscCall(VecDestroy(&vB));
2213   PetscCall(VecCopy(vA, v));
2214   PetscCall(VecDestroy(&vA));
2215   PetscFunctionReturn(PETSC_SUCCESS);
2216 }
2217 
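/*
   For the min/max routines the compressed off-diagonal block needs extra care: a
   row of B that is not dense has implicit 0.0 entries, so the "first hole in the
   cmap" search below recovers the global column of the first such implicit zero
   to report alongside the value 0.0.
*/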
2218 static PetscErrorCode MatGetRowMinAbs_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2219 {
2220   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2221   PetscInt           m = A->rmap->n, n = A->cmap->n;
2222   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2223   PetscInt          *cmap = mat->garray;
2224   PetscInt          *diagIdx, *offdiagIdx;
2225   Vec                diagV, offdiagV;
2226   PetscScalar       *a, *diagA, *offdiagA;
2227   const PetscScalar *ba, *bav;
2228   PetscInt           r, j, col, ncols, *bi, *bj;
2229   Mat                B = mat->B;
2230   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2231 
2232   PetscFunctionBegin;
2233   /* When one process holds the entire matrix A and the other processes have no entries */
2234   if (A->cmap->N == n) {
2235     PetscCall(VecGetArrayWrite(v, &diagA));
2236     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2237     PetscCall(MatGetRowMinAbs(mat->A, diagV, idx));
2238     PetscCall(VecDestroy(&diagV));
2239     PetscCall(VecRestoreArrayWrite(v, &diagA));
2240     PetscFunctionReturn(PETSC_SUCCESS);
2241   } else if (n == 0) {
2242     if (m) {
2243       PetscCall(VecGetArrayWrite(v, &a));
2244       for (r = 0; r < m; r++) {
2245         a[r] = 0.0;
2246         if (idx) idx[r] = -1;
2247       }
2248       PetscCall(VecRestoreArrayWrite(v, &a));
2249     }
2250     PetscFunctionReturn(PETSC_SUCCESS);
2251   }
2252 
2253   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2254   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2255   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2256   PetscCall(MatGetRowMinAbs(mat->A, diagV, diagIdx));
2257 
2258   /* Get offdiagIdx[] for implicit 0.0 */
2259   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2260   ba = bav;
2261   bi = b->i;
2262   bj = b->j;
2263   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2264   for (r = 0; r < m; r++) {
2265     ncols = bi[r + 1] - bi[r];
2266     if (ncols == A->cmap->N - n) { /* Brow is dense */
2267       offdiagA[r]   = *ba;
2268       offdiagIdx[r] = cmap[0];
2269     } else { /* Brow is sparse, so we already KNOW the minimum in absolute value is 0.0 */
2270       offdiagA[r] = 0.0;
2271 
2272       /* Find first hole in the cmap */
2273       for (j = 0; j < ncols; j++) {
2274         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2275         if (col > j && j < cstart) {
2276           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2277           break;
2278         } else if (col > j + n && j >= cstart) {
2279           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2280           break;
2281         }
2282       }
2283       if (j == ncols && ncols < A->cmap->N - n) {
2284         /* a hole is outside compressed Bcols */
2285         if (ncols == 0) {
2286           if (cstart) {
2287             offdiagIdx[r] = 0;
2288           } else offdiagIdx[r] = cend;
2289         } else { /* ncols > 0 */
2290           offdiagIdx[r] = cmap[ncols - 1] + 1;
2291           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2292         }
2293       }
2294     }
2295 
2296     for (j = 0; j < ncols; j++) {
2297       if (PetscAbsScalar(offdiagA[r]) > PetscAbsScalar(*ba)) {
2298         offdiagA[r]   = *ba;
2299         offdiagIdx[r] = cmap[*bj];
2300       }
2301       ba++;
2302       bj++;
2303     }
2304   }
2305 
2306   PetscCall(VecGetArrayWrite(v, &a));
2307   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2308   for (r = 0; r < m; ++r) {
2309     if (PetscAbsScalar(diagA[r]) < PetscAbsScalar(offdiagA[r])) {
2310       a[r] = diagA[r];
2311       if (idx) idx[r] = cstart + diagIdx[r];
2312     } else if (PetscAbsScalar(diagA[r]) == PetscAbsScalar(offdiagA[r])) {
2313       a[r] = diagA[r];
2314       if (idx) {
2315         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2316           idx[r] = cstart + diagIdx[r];
2317         } else idx[r] = offdiagIdx[r];
2318       }
2319     } else {
2320       a[r] = offdiagA[r];
2321       if (idx) idx[r] = offdiagIdx[r];
2322     }
2323   }
2324   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2325   PetscCall(VecRestoreArrayWrite(v, &a));
2326   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2327   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2328   PetscCall(VecDestroy(&diagV));
2329   PetscCall(VecDestroy(&offdiagV));
2330   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2331   PetscFunctionReturn(PETSC_SUCCESS);
2332 }
2333 
2334 static PetscErrorCode MatGetRowMin_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2335 {
2336   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2337   PetscInt           m = A->rmap->n, n = A->cmap->n;
2338   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2339   PetscInt          *cmap = mat->garray;
2340   PetscInt          *diagIdx, *offdiagIdx;
2341   Vec                diagV, offdiagV;
2342   PetscScalar       *a, *diagA, *offdiagA;
2343   const PetscScalar *ba, *bav;
2344   PetscInt           r, j, col, ncols, *bi, *bj;
2345   Mat                B = mat->B;
2346   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2347 
2348   PetscFunctionBegin;
2349   /* When one process holds the entire matrix A and the other processes have no entries */
2350   if (A->cmap->N == n) {
2351     PetscCall(VecGetArrayWrite(v, &diagA));
2352     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2353     PetscCall(MatGetRowMin(mat->A, diagV, idx));
2354     PetscCall(VecDestroy(&diagV));
2355     PetscCall(VecRestoreArrayWrite(v, &diagA));
2356     PetscFunctionReturn(PETSC_SUCCESS);
2357   } else if (n == 0) {
2358     if (m) {
2359       PetscCall(VecGetArrayWrite(v, &a));
2360       for (r = 0; r < m; r++) {
2361         a[r] = PETSC_MAX_REAL;
2362         if (idx) idx[r] = -1;
2363       }
2364       PetscCall(VecRestoreArrayWrite(v, &a));
2365     }
2366     PetscFunctionReturn(PETSC_SUCCESS);
2367   }
2368 
2369   PetscCall(PetscCalloc2(m, &diagIdx, m, &offdiagIdx));
2370   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2371   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2372   PetscCall(MatGetRowMin(mat->A, diagV, diagIdx));
2373 
2374   /* Get offdiagIdx[] for implicit 0.0 */
2375   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2376   ba = bav;
2377   bi = b->i;
2378   bj = b->j;
2379   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2380   for (r = 0; r < m; r++) {
2381     ncols = bi[r + 1] - bi[r];
2382     if (ncols == A->cmap->N - n) { /* Brow is dense */
2383       offdiagA[r]   = *ba;
2384       offdiagIdx[r] = cmap[0];
2385     } else { /* Brow is sparse, so we already KNOW the minimum is 0.0 or lower */
2386       offdiagA[r] = 0.0;
2387 
2388       /* Find first hole in the cmap */
2389       for (j = 0; j < ncols; j++) {
2390         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2391         if (col > j && j < cstart) {
2392           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2393           break;
2394         } else if (col > j + n && j >= cstart) {
2395           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2396           break;
2397         }
2398       }
2399       if (j == ncols && ncols < A->cmap->N - n) {
2400         /* a hole is outside compressed Bcols */
2401         if (ncols == 0) {
2402           if (cstart) {
2403             offdiagIdx[r] = 0;
2404           } else offdiagIdx[r] = cend;
2405         } else { /* ncols > 0 */
2406           offdiagIdx[r] = cmap[ncols - 1] + 1;
2407           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2408         }
2409       }
2410     }
2411 
2412     for (j = 0; j < ncols; j++) {
2413       if (PetscRealPart(offdiagA[r]) > PetscRealPart(*ba)) {
2414         offdiagA[r]   = *ba;
2415         offdiagIdx[r] = cmap[*bj];
2416       }
2417       ba++;
2418       bj++;
2419     }
2420   }
2421 
2422   PetscCall(VecGetArrayWrite(v, &a));
2423   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2424   for (r = 0; r < m; ++r) {
2425     if (PetscRealPart(diagA[r]) < PetscRealPart(offdiagA[r])) {
2426       a[r] = diagA[r];
2427       if (idx) idx[r] = cstart + diagIdx[r];
2428     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2429       a[r] = diagA[r];
2430       if (idx) {
2431         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2432           idx[r] = cstart + diagIdx[r];
2433         } else idx[r] = offdiagIdx[r];
2434       }
2435     } else {
2436       a[r] = offdiagA[r];
2437       if (idx) idx[r] = offdiagIdx[r];
2438     }
2439   }
2440   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2441   PetscCall(VecRestoreArrayWrite(v, &a));
2442   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2443   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2444   PetscCall(VecDestroy(&diagV));
2445   PetscCall(VecDestroy(&offdiagV));
2446   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2447   PetscFunctionReturn(PETSC_SUCCESS);
2448 }
2449 
2450 static PetscErrorCode MatGetRowMax_MPIAIJ(Mat A, Vec v, PetscInt idx[])
2451 {
2452   Mat_MPIAIJ        *mat = (Mat_MPIAIJ *)A->data;
2453   PetscInt           m = A->rmap->n, n = A->cmap->n;
2454   PetscInt           cstart = A->cmap->rstart, cend = A->cmap->rend;
2455   PetscInt          *cmap = mat->garray;
2456   PetscInt          *diagIdx, *offdiagIdx;
2457   Vec                diagV, offdiagV;
2458   PetscScalar       *a, *diagA, *offdiagA;
2459   const PetscScalar *ba, *bav;
2460   PetscInt           r, j, col, ncols, *bi, *bj;
2461   Mat                B = mat->B;
2462   Mat_SeqAIJ        *b = (Mat_SeqAIJ *)B->data;
2463 
2464   PetscFunctionBegin;
2465   /* When one process holds the entire matrix A and the other processes have no entries */
2466   if (A->cmap->N == n) {
2467     PetscCall(VecGetArrayWrite(v, &diagA));
2468     PetscCall(VecCreateSeqWithArray(PETSC_COMM_SELF, 1, m, diagA, &diagV));
2469     PetscCall(MatGetRowMax(mat->A, diagV, idx));
2470     PetscCall(VecDestroy(&diagV));
2471     PetscCall(VecRestoreArrayWrite(v, &diagA));
2472     PetscFunctionReturn(PETSC_SUCCESS);
2473   } else if (n == 0) {
2474     if (m) {
2475       PetscCall(VecGetArrayWrite(v, &a));
2476       for (r = 0; r < m; r++) {
2477         a[r] = PETSC_MIN_REAL;
2478         if (idx) idx[r] = -1;
2479       }
2480       PetscCall(VecRestoreArrayWrite(v, &a));
2481     }
2482     PetscFunctionReturn(PETSC_SUCCESS);
2483   }
2484 
2485   PetscCall(PetscMalloc2(m, &diagIdx, m, &offdiagIdx));
2486   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &diagV));
2487   PetscCall(VecCreateSeq(PETSC_COMM_SELF, m, &offdiagV));
2488   PetscCall(MatGetRowMax(mat->A, diagV, diagIdx));
2489 
2490   /* Get offdiagIdx[] for implicit 0.0 */
2491   PetscCall(MatSeqAIJGetArrayRead(B, &bav));
2492   ba = bav;
2493   bi = b->i;
2494   bj = b->j;
2495   PetscCall(VecGetArrayWrite(offdiagV, &offdiagA));
2496   for (r = 0; r < m; r++) {
2497     ncols = bi[r + 1] - bi[r];
2498     if (ncols == A->cmap->N - n) { /* Brow is dense */
2499       offdiagA[r]   = *ba;
2500       offdiagIdx[r] = cmap[0];
2501     } else { /* Brow is sparse, so we already KNOW the maximum is 0.0 or higher */
2502       offdiagA[r] = 0.0;
2503 
2504       /* Find first hole in the cmap */
2505       for (j = 0; j < ncols; j++) {
2506         col = cmap[bj[j]]; /* global column number = cmap[B column number] */
2507         if (col > j && j < cstart) {
2508           offdiagIdx[r] = j; /* global column number of first implicit 0.0 */
2509           break;
2510         } else if (col > j + n && j >= cstart) {
2511           offdiagIdx[r] = j + n; /* global column number of first implicit 0.0 */
2512           break;
2513         }
2514       }
2515       if (j == ncols && ncols < A->cmap->N - n) {
2516         /* a hole is outside compressed Bcols */
2517         if (ncols == 0) {
2518           if (cstart) {
2519             offdiagIdx[r] = 0;
2520           } else offdiagIdx[r] = cend;
2521         } else { /* ncols > 0 */
2522           offdiagIdx[r] = cmap[ncols - 1] + 1;
2523           if (offdiagIdx[r] == cstart) offdiagIdx[r] += n;
2524         }
2525       }
2526     }
2527 
2528     for (j = 0; j < ncols; j++) {
2529       if (PetscRealPart(offdiagA[r]) < PetscRealPart(*ba)) {
2530         offdiagA[r]   = *ba;
2531         offdiagIdx[r] = cmap[*bj];
2532       }
2533       ba++;
2534       bj++;
2535     }
2536   }
2537 
2538   PetscCall(VecGetArrayWrite(v, &a));
2539   PetscCall(VecGetArrayRead(diagV, (const PetscScalar **)&diagA));
2540   for (r = 0; r < m; ++r) {
2541     if (PetscRealPart(diagA[r]) > PetscRealPart(offdiagA[r])) {
2542       a[r] = diagA[r];
2543       if (idx) idx[r] = cstart + diagIdx[r];
2544     } else if (PetscRealPart(diagA[r]) == PetscRealPart(offdiagA[r])) {
2545       a[r] = diagA[r];
2546       if (idx) {
2547         if (cstart + diagIdx[r] <= offdiagIdx[r]) {
2548           idx[r] = cstart + diagIdx[r];
2549         } else idx[r] = offdiagIdx[r];
2550       }
2551     } else {
2552       a[r] = offdiagA[r];
2553       if (idx) idx[r] = offdiagIdx[r];
2554     }
2555   }
2556   PetscCall(MatSeqAIJRestoreArrayRead(B, &bav));
2557   PetscCall(VecRestoreArrayWrite(v, &a));
2558   PetscCall(VecRestoreArrayRead(diagV, (const PetscScalar **)&diagA));
2559   PetscCall(VecRestoreArrayWrite(offdiagV, &offdiagA));
2560   PetscCall(VecDestroy(&diagV));
2561   PetscCall(VecDestroy(&offdiagV));
2562   PetscCall(PetscFree2(diagIdx, offdiagIdx));
2563   PetscFunctionReturn(PETSC_SUCCESS);
2564 }
2565 
2566 PetscErrorCode MatGetSeqNonzeroStructure_MPIAIJ(Mat mat, Mat *newmat)
2567 {
2568   Mat *dummy;
2569 
2570   PetscFunctionBegin;
2571   PetscCall(MatCreateSubMatrix_MPIAIJ_All(mat, MAT_DO_NOT_GET_VALUES, MAT_INITIAL_MATRIX, &dummy));
2572   *newmat = *dummy;
2573   PetscCall(PetscFree(dummy));
2574   PetscFunctionReturn(PETSC_SUCCESS);
2575 }
2576 
2577 static PetscErrorCode MatInvertBlockDiagonal_MPIAIJ(Mat A, const PetscScalar **values)
2578 {
2579   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2580 
2581   PetscFunctionBegin;
2582   PetscCall(MatInvertBlockDiagonal(a->A, values));
2583   A->factorerrortype = a->A->factorerrortype;
2584   PetscFunctionReturn(PETSC_SUCCESS);
2585 }
2586 
2587 static PetscErrorCode MatSetRandom_MPIAIJ(Mat x, PetscRandom rctx)
2588 {
2589   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)x->data;
2590 
2591   PetscFunctionBegin;
2592   PetscCheck(x->assembled || x->preallocated, PetscObjectComm((PetscObject)x), PETSC_ERR_ARG_WRONGSTATE, "MatSetRandom on an unassembled and unpreallocated MATMPIAIJ is not allowed");
2593   PetscCall(MatSetRandom(aij->A, rctx));
2594   if (x->assembled) {
2595     PetscCall(MatSetRandom(aij->B, rctx));
2596   } else {
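    /* before the first assembly the off-diagonal block B still uses global column ids, so skip
       the locally owned column range; entries there belong in the diagonal block aij->A */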
2597     PetscCall(MatSetRandomSkipColumnRange_SeqAIJ_Private(aij->B, x->cmap->rstart, x->cmap->rend, rctx));
2598   }
2599   PetscCall(MatAssemblyBegin(x, MAT_FINAL_ASSEMBLY));
2600   PetscCall(MatAssemblyEnd(x, MAT_FINAL_ASSEMBLY));
2601   PetscFunctionReturn(PETSC_SUCCESS);
2602 }
2603 
2604 static PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ(Mat A, PetscBool sc)
2605 {
2606   PetscFunctionBegin;
2607   if (sc) A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ_Scalable;
2608   else A->ops->increaseoverlap = MatIncreaseOverlap_MPIAIJ;
2609   PetscFunctionReturn(PETSC_SUCCESS);
2610 }
2611 
2612 /*@
2613   MatMPIAIJGetNumberNonzeros - gets the number of nonzeros in the matrix on this MPI rank
2614 
2615   Not Collective
2616 
2617   Input Parameter:
2618 . A - the matrix
2619 
2620   Output Parameter:
2621 . nz - the number of nonzeros stored on this MPI rank
2622 
2623   Level: advanced
2624 
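  Example Usage:
  A minimal sketch, assuming `A` is an assembled `MATMPIAIJ` matrix
.vb
  PetscCount nz;

  PetscCall(MatMPIAIJGetNumberNonzeros(A, &nz));
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "nonzeros stored on this rank: %" PetscCount_FMT "\n", nz));
.ve
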
2625 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
2626 @*/
2627 PetscErrorCode MatMPIAIJGetNumberNonzeros(Mat A, PetscCount *nz)
2628 {
2629   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)A->data;
2630   Mat_SeqAIJ *aaij = (Mat_SeqAIJ *)maij->A->data, *baij = (Mat_SeqAIJ *)maij->B->data;
2631   PetscBool   isaij;
2632 
2633   PetscFunctionBegin;
2634   PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATMPIAIJ, &isaij));
2635   PetscCheck(isaij, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Not for type %s", ((PetscObject)A)->type_name);
2636   *nz = aaij->i[A->rmap->n] + baij->i[A->rmap->n];
2637   PetscFunctionReturn(PETSC_SUCCESS);
2638 }
2639 
2640 /*@
2641   MatMPIAIJSetUseScalableIncreaseOverlap - Set whether the matrix uses a scalable algorithm to compute the overlap
2642 
2643   Collective
2644 
2645   Input Parameters:
2646 + A  - the matrix
2647 - sc - `PETSC_TRUE` indicates use the scalable algorithm (default is not to use the scalable algorithm)
2648 
2649   Level: advanced
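  Options Database Key:
. -mat_increase_overlap_scalable - use the scalable algorithm to compute the overlap
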
2650 
2651 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`
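  Example Usage:
  A minimal sketch, assuming `A` is a `MATMPIAIJ` matrix
.vb
  PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, PETSC_TRUE));
.ve
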
2652 @*/
2653 PetscErrorCode MatMPIAIJSetUseScalableIncreaseOverlap(Mat A, PetscBool sc)
2654 {
2655   PetscFunctionBegin;
2656   PetscTryMethod(A, "MatMPIAIJSetUseScalableIncreaseOverlap_C", (Mat, PetscBool), (A, sc));
2657   PetscFunctionReturn(PETSC_SUCCESS);
2658 }
2659 
2660 PetscErrorCode MatSetFromOptions_MPIAIJ(Mat A, PetscOptionItems PetscOptionsObject)
2661 {
2662   PetscBool sc = PETSC_FALSE, flg;
2663 
2664   PetscFunctionBegin;
2665   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJ options");
2666   if (A->ops->increaseoverlap == MatIncreaseOverlap_MPIAIJ_Scalable) sc = PETSC_TRUE;
2667   PetscCall(PetscOptionsBool("-mat_increase_overlap_scalable", "Use a scalable algorithm to compute the overlap", "MatIncreaseOverlap", sc, &sc, &flg));
2668   if (flg) PetscCall(MatMPIAIJSetUseScalableIncreaseOverlap(A, sc));
2669   PetscOptionsHeadEnd();
2670   PetscFunctionReturn(PETSC_SUCCESS);
2671 }
2672 
2673 static PetscErrorCode MatShift_MPIAIJ(Mat Y, PetscScalar a)
2674 {
2675   Mat_MPIAIJ *maij = (Mat_MPIAIJ *)Y->data;
2676   Mat_SeqAIJ *aij  = (Mat_SeqAIJ *)maij->A->data;
2677 
2678   PetscFunctionBegin;
2679   if (!Y->preallocated) {
2680     PetscCall(MatMPIAIJSetPreallocation(Y, 1, NULL, 0, NULL));
2681   } else if (!aij->nz) { /* It does not matter if the diagonal of Y only partially lies in maij->A; we just need an estimate for the preallocation. */
2682     PetscInt nonew = aij->nonew;
2683     PetscCall(MatSeqAIJSetPreallocation(maij->A, 1, NULL));
2684     aij->nonew = nonew;
2685   }
2686   PetscCall(MatShift_Basic(Y, a));
2687   PetscFunctionReturn(PETSC_SUCCESS);
2688 }
2689 
2690 static PetscErrorCode MatMissingDiagonal_MPIAIJ(Mat A, PetscBool *missing, PetscInt *d)
2691 {
2692   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2693 
2694   PetscFunctionBegin;
2695   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only works for square matrices");
2696   PetscCall(MatMissingDiagonal(a->A, missing, d));
2697   if (d) {
2698     PetscInt rstart;
2699     PetscCall(MatGetOwnershipRange(A, &rstart, NULL));
2700     *d += rstart;
2701   }
2702   PetscFunctionReturn(PETSC_SUCCESS);
2703 }
2704 
2705 static PetscErrorCode MatInvertVariableBlockDiagonal_MPIAIJ(Mat A, PetscInt nblocks, const PetscInt *bsizes, PetscScalar *diag)
2706 {
2707   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2708 
2709   PetscFunctionBegin;
2710   PetscCall(MatInvertVariableBlockDiagonal(a->A, nblocks, bsizes, diag));
2711   PetscFunctionReturn(PETSC_SUCCESS);
2712 }
2713 
2714 static PetscErrorCode MatEliminateZeros_MPIAIJ(Mat A, PetscBool keep)
2715 {
2716   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
2717 
2718   PetscFunctionBegin;
2719   PetscCall(MatEliminateZeros_SeqAIJ(a->A, keep));        // possibly keep zero diagonal coefficients
2720   PetscCall(MatEliminateZeros_SeqAIJ(a->B, PETSC_FALSE)); // never keep zero diagonal coefficients
2721   PetscFunctionReturn(PETSC_SUCCESS);
2722 }
2723 
2724 static struct _MatOps MatOps_Values = {MatSetValues_MPIAIJ,
2725                                        MatGetRow_MPIAIJ,
2726                                        MatRestoreRow_MPIAIJ,
2727                                        MatMult_MPIAIJ,
2728                                        /* 4*/ MatMultAdd_MPIAIJ,
2729                                        MatMultTranspose_MPIAIJ,
2730                                        MatMultTransposeAdd_MPIAIJ,
2731                                        NULL,
2732                                        NULL,
2733                                        NULL,
2734                                        /*10*/ NULL,
2735                                        NULL,
2736                                        NULL,
2737                                        MatSOR_MPIAIJ,
2738                                        MatTranspose_MPIAIJ,
2739                                        /*15*/ MatGetInfo_MPIAIJ,
2740                                        MatEqual_MPIAIJ,
2741                                        MatGetDiagonal_MPIAIJ,
2742                                        MatDiagonalScale_MPIAIJ,
2743                                        MatNorm_MPIAIJ,
2744                                        /*20*/ MatAssemblyBegin_MPIAIJ,
2745                                        MatAssemblyEnd_MPIAIJ,
2746                                        MatSetOption_MPIAIJ,
2747                                        MatZeroEntries_MPIAIJ,
2748                                        /*24*/ MatZeroRows_MPIAIJ,
2749                                        NULL,
2750                                        NULL,
2751                                        NULL,
2752                                        NULL,
2753                                        /*29*/ MatSetUp_MPI_Hash,
2754                                        NULL,
2755                                        NULL,
2756                                        MatGetDiagonalBlock_MPIAIJ,
2757                                        NULL,
2758                                        /*34*/ MatDuplicate_MPIAIJ,
2759                                        NULL,
2760                                        NULL,
2761                                        NULL,
2762                                        NULL,
2763                                        /*39*/ MatAXPY_MPIAIJ,
2764                                        MatCreateSubMatrices_MPIAIJ,
2765                                        MatIncreaseOverlap_MPIAIJ,
2766                                        MatGetValues_MPIAIJ,
2767                                        MatCopy_MPIAIJ,
2768                                        /*44*/ MatGetRowMax_MPIAIJ,
2769                                        MatScale_MPIAIJ,
2770                                        MatShift_MPIAIJ,
2771                                        MatDiagonalSet_MPIAIJ,
2772                                        MatZeroRowsColumns_MPIAIJ,
2773                                        /*49*/ MatSetRandom_MPIAIJ,
2774                                        MatGetRowIJ_MPIAIJ,
2775                                        MatRestoreRowIJ_MPIAIJ,
2776                                        NULL,
2777                                        NULL,
2778                                        /*54*/ MatFDColoringCreate_MPIXAIJ,
2779                                        NULL,
2780                                        MatSetUnfactored_MPIAIJ,
2781                                        MatPermute_MPIAIJ,
2782                                        NULL,
2783                                        /*59*/ MatCreateSubMatrix_MPIAIJ,
2784                                        MatDestroy_MPIAIJ,
2785                                        MatView_MPIAIJ,
2786                                        NULL,
2787                                        NULL,
2788                                        /*64*/ MatMatMatMultNumeric_MPIAIJ_MPIAIJ_MPIAIJ,
2789                                        NULL,
2790                                        NULL,
2791                                        NULL,
2792                                        MatGetRowMaxAbs_MPIAIJ,
2793                                        /*69*/ MatGetRowMinAbs_MPIAIJ,
2794                                        NULL,
2795                                        NULL,
2796                                        MatFDColoringApply_AIJ,
2797                                        MatSetFromOptions_MPIAIJ,
2798                                        MatFindZeroDiagonals_MPIAIJ,
2799                                        /*75*/ NULL,
2800                                        NULL,
2801                                        NULL,
2802                                        MatLoad_MPIAIJ,
2803                                        NULL,
2804                                        /*80*/ NULL,
2805                                        NULL,
2806                                        NULL,
2807                                        /*83*/ NULL,
2808                                        NULL,
2809                                        MatMatMultNumeric_MPIAIJ_MPIAIJ,
2810                                        MatPtAPNumeric_MPIAIJ_MPIAIJ,
2811                                        NULL,
2812                                        NULL,
2813                                        /*89*/ MatBindToCPU_MPIAIJ,
2814                                        MatProductSetFromOptions_MPIAIJ,
2815                                        NULL,
2816                                        NULL,
2817                                        MatConjugate_MPIAIJ,
2818                                        /*94*/ NULL,
2819                                        MatSetValuesRow_MPIAIJ,
2820                                        MatRealPart_MPIAIJ,
2821                                        MatImaginaryPart_MPIAIJ,
2822                                        NULL,
2823                                        /*99*/ NULL,
2824                                        NULL,
2825                                        NULL,
2826                                        MatGetRowMin_MPIAIJ,
2827                                        NULL,
2828                                        /*104*/ MatMissingDiagonal_MPIAIJ,
2829                                        MatGetSeqNonzeroStructure_MPIAIJ,
2830                                        NULL,
2831                                        MatGetGhosts_MPIAIJ,
2832                                        NULL,
2833                                        /*109*/ NULL,
2834                                        MatMultDiagonalBlock_MPIAIJ,
2835                                        NULL,
2836                                        NULL,
2837                                        NULL,
2838                                        /*114*/ MatGetMultiProcBlock_MPIAIJ,
2839                                        MatFindNonzeroRows_MPIAIJ,
2840                                        MatGetColumnReductions_MPIAIJ,
2841                                        MatInvertBlockDiagonal_MPIAIJ,
2842                                        MatInvertVariableBlockDiagonal_MPIAIJ,
2843                                        /*119*/ MatCreateSubMatricesMPI_MPIAIJ,
2844                                        NULL,
2845                                        NULL,
2846                                        MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ,
2847                                        NULL,
2848                                        /*124*/ NULL,
2849                                        NULL,
2850                                        NULL,
2851                                        MatSetBlockSizes_MPIAIJ,
2852                                        NULL,
2853                                        /*129*/ MatFDColoringSetUp_MPIXAIJ,
2854                                        MatFindOffBlockDiagonalEntries_MPIAIJ,
2855                                        MatCreateMPIMatConcatenateSeqMat_MPIAIJ,
2856                                        NULL,
2857                                        NULL,
2858                                        /*134*/ NULL,
2859                                        MatCreateGraph_Simple_AIJ,
2860                                        NULL,
2861                                        MatEliminateZeros_MPIAIJ,
2862                                        MatGetRowSumAbs_MPIAIJ,
2863                                        /*139*/ NULL,
2864                                        NULL,
2865                                        NULL,
2866                                        MatCopyHashToXAIJ_MPI_Hash};
2867 
2868 static PetscErrorCode MatStoreValues_MPIAIJ(Mat mat)
2869 {
2870   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2871 
2872   PetscFunctionBegin;
2873   PetscCall(MatStoreValues(aij->A));
2874   PetscCall(MatStoreValues(aij->B));
2875   PetscFunctionReturn(PETSC_SUCCESS);
2876 }
2877 
2878 static PetscErrorCode MatRetrieveValues_MPIAIJ(Mat mat)
2879 {
2880   Mat_MPIAIJ *aij = (Mat_MPIAIJ *)mat->data;
2881 
2882   PetscFunctionBegin;
2883   PetscCall(MatRetrieveValues(aij->A));
2884   PetscCall(MatRetrieveValues(aij->B));
2885   PetscFunctionReturn(PETSC_SUCCESS);
2886 }
2887 
2888 PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJ(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
2889 {
2890   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2891   PetscMPIInt size;
2892 
2893   PetscFunctionBegin;
2894   if (B->hash_active) {
2895     B->ops[0]      = b->cops;
2896     B->hash_active = PETSC_FALSE;
2897   }
2898   PetscCall(PetscLayoutSetUp(B->rmap));
2899   PetscCall(PetscLayoutSetUp(B->cmap));
2900 
2901 #if defined(PETSC_USE_CTABLE)
2902   PetscCall(PetscHMapIDestroy(&b->colmap));
2903 #else
2904   PetscCall(PetscFree(b->colmap));
2905 #endif
2906   PetscCall(PetscFree(b->garray));
2907   PetscCall(VecDestroy(&b->lvec));
2908   PetscCall(VecScatterDestroy(&b->Mvctx));
2909 
2910   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2911 
2912   MatSeqXAIJGetOptions_Private(b->B);
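  /* b->B is created below with the full global column width; at assembly time,
     MatSetUpMultiply_MPIAIJ() compacts its columns to those actually used and builds garray */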
2913   PetscCall(MatDestroy(&b->B));
2914   PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
2915   PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
2916   PetscCall(MatSetBlockSizesFromMats(b->B, B, B));
2917   PetscCall(MatSetType(b->B, MATSEQAIJ));
2918   MatSeqXAIJRestoreOptions_Private(b->B);
2919 
2920   MatSeqXAIJGetOptions_Private(b->A);
2921   PetscCall(MatDestroy(&b->A));
2922   PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
2923   PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
2924   PetscCall(MatSetBlockSizesFromMats(b->A, B, B));
2925   PetscCall(MatSetType(b->A, MATSEQAIJ));
2926   MatSeqXAIJRestoreOptions_Private(b->A);
2927 
2928   PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
2929   PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
2930   B->preallocated  = PETSC_TRUE;
2931   B->was_assembled = PETSC_FALSE;
2932   B->assembled     = PETSC_FALSE;
2933   PetscFunctionReturn(PETSC_SUCCESS);
2934 }
2935 
2936 static PetscErrorCode MatResetPreallocation_MPIAIJ(Mat B)
2937 {
2938   Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
2939   PetscBool   ondiagreset, offdiagreset, memoryreset;
2940 
2941   PetscFunctionBegin;
2942   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
2943   PetscCheck(B->insertmode == NOT_SET_VALUES, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot reset preallocation after setting some values but not yet calling MatAssemblyBegin()/MatAssemblyEnd()");
2944   if (B->num_ass == 0) PetscFunctionReturn(PETSC_SUCCESS);
2945 
2946   PetscCall(MatResetPreallocation_SeqAIJ_Private(b->A, &ondiagreset));
2947   PetscCall(MatResetPreallocation_SeqAIJ_Private(b->B, &offdiagreset));
2948   memoryreset = (PetscBool)(ondiagreset || offdiagreset);
2949   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &memoryreset, 1, MPIU_BOOL, MPI_LOR, PetscObjectComm((PetscObject)B)));
2950   if (!memoryreset) PetscFunctionReturn(PETSC_SUCCESS);
2951 
2952   PetscCall(PetscLayoutSetUp(B->rmap));
2953   PetscCall(PetscLayoutSetUp(B->cmap));
2954   PetscCheck(B->assembled || B->was_assembled, PetscObjectComm((PetscObject)B), PETSC_ERR_ARG_WRONGSTATE, "Should not need to reset preallocation if the matrix was never assembled");
2955   PetscCall(MatDisAssemble_MPIAIJ(B, PETSC_TRUE));
2956   PetscCall(VecScatterDestroy(&b->Mvctx));
2957 
2958   B->preallocated  = PETSC_TRUE;
2959   B->was_assembled = PETSC_FALSE;
2960   B->assembled     = PETSC_FALSE;
2961   /* Log that the state of this object has changed; this will help guarantee that preconditioners get re-setup */
2962   PetscCall(PetscObjectStateIncrease((PetscObject)B));
2963   PetscFunctionReturn(PETSC_SUCCESS);
2964 }
2965 
2966 PetscErrorCode MatDuplicate_MPIAIJ(Mat matin, MatDuplicateOption cpvalues, Mat *newmat)
2967 {
2968   Mat         mat;
2969   Mat_MPIAIJ *a, *oldmat = (Mat_MPIAIJ *)matin->data;
2970 
2971   PetscFunctionBegin;
2972   *newmat = NULL;
2973   PetscCall(MatCreate(PetscObjectComm((PetscObject)matin), &mat));
2974   PetscCall(MatSetSizes(mat, matin->rmap->n, matin->cmap->n, matin->rmap->N, matin->cmap->N));
2975   PetscCall(MatSetBlockSizesFromMats(mat, matin, matin));
2976   PetscCall(MatSetType(mat, ((PetscObject)matin)->type_name));
2977   a = (Mat_MPIAIJ *)mat->data;
2978 
2979   mat->factortype = matin->factortype;
2980   mat->assembled  = matin->assembled;
2981   mat->insertmode = NOT_SET_VALUES;
2982 
2983   a->size         = oldmat->size;
2984   a->rank         = oldmat->rank;
2985   a->donotstash   = oldmat->donotstash;
2986   a->roworiented  = oldmat->roworiented;
2987   a->rowindices   = NULL;
2988   a->rowvalues    = NULL;
2989   a->getrowactive = PETSC_FALSE;
2990 
2991   PetscCall(PetscLayoutReference(matin->rmap, &mat->rmap));
2992   PetscCall(PetscLayoutReference(matin->cmap, &mat->cmap));
2993   if (matin->hash_active) {
2994     PetscCall(MatSetUp(mat));
2995   } else {
2996     mat->preallocated = matin->preallocated;
2997     if (oldmat->colmap) {
2998 #if defined(PETSC_USE_CTABLE)
2999       PetscCall(PetscHMapIDuplicate(oldmat->colmap, &a->colmap));
3000 #else
3001       PetscCall(PetscMalloc1(mat->cmap->N, &a->colmap));
3002       PetscCall(PetscArraycpy(a->colmap, oldmat->colmap, mat->cmap->N));
3003 #endif
3004     } else a->colmap = NULL;
3005     if (oldmat->garray) {
3006       PetscInt len;
3007       len = oldmat->B->cmap->n;
3008       PetscCall(PetscMalloc1(len + 1, &a->garray));
3009       if (len) PetscCall(PetscArraycpy(a->garray, oldmat->garray, len));
3010     } else a->garray = NULL;
3011 
3012     /* MatDuplicate() may be called with a non-assembled matrix; in fact, it only
3013       requires the matrix to be preallocated. This may happen inside a
3014       DMCreateMatrix_Shell */
3015     if (oldmat->lvec) PetscCall(VecDuplicate(oldmat->lvec, &a->lvec));
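    /* the scatter context can be shared by reference since the duplicate has the same layouts */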
3016     if (oldmat->Mvctx) {
3017       a->Mvctx = oldmat->Mvctx;
3018       PetscCall(PetscObjectReference((PetscObject)oldmat->Mvctx));
3019     }
3020     PetscCall(MatDuplicate(oldmat->A, cpvalues, &a->A));
3021     PetscCall(MatDuplicate(oldmat->B, cpvalues, &a->B));
3022   }
3023   PetscCall(PetscFunctionListDuplicate(((PetscObject)matin)->qlist, &((PetscObject)mat)->qlist));
3024   *newmat = mat;
3025   PetscFunctionReturn(PETSC_SUCCESS);
3026 }
3027 
3028 PetscErrorCode MatLoad_MPIAIJ(Mat newMat, PetscViewer viewer)
3029 {
3030   PetscBool isbinary, ishdf5;
3031 
3032   PetscFunctionBegin;
3033   PetscValidHeaderSpecific(newMat, MAT_CLASSID, 1);
3034   PetscValidHeaderSpecific(viewer, PETSC_VIEWER_CLASSID, 2);
3035   /* force binary viewer to load .info file if it has not yet done so */
3036   PetscCall(PetscViewerSetUp(viewer));
3037   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3038   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERHDF5, &ishdf5));
3039   if (isbinary) {
3040     PetscCall(MatLoad_MPIAIJ_Binary(newMat, viewer));
3041   } else if (ishdf5) {
3042 #if defined(PETSC_HAVE_HDF5)
3043     PetscCall(MatLoad_AIJ_HDF5(newMat, viewer));
3044 #else
3045     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "HDF5 not supported in this build.\nPlease reconfigure using --download-hdf5");
3046 #endif
3047   } else {
3048     SETERRQ(PetscObjectComm((PetscObject)newMat), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)newMat)->type_name);
3049   }
3050   PetscFunctionReturn(PETSC_SUCCESS);
3051 }
3052 
3053 PetscErrorCode MatLoad_MPIAIJ_Binary(Mat mat, PetscViewer viewer)
3054 {
3055   PetscInt     header[4], M, N, m, nz, rows, cols, sum, i;
3056   PetscInt    *rowidxs, *colidxs;
3057   PetscScalar *matvals;
3058 
3059   PetscFunctionBegin;
3060   PetscCall(PetscViewerSetUp(viewer));
3061 
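  /* The file contains, in the order read below: a four-entry header [MAT_FILE_CLASSID, M, N, nz],
     then the M row lengths, the nz column indices, and finally the nz scalar values */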
3062   /* read in matrix header */
3063   PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
3064   PetscCheck(header[0] == MAT_FILE_CLASSID, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
3065   M  = header[1];
3066   N  = header[2];
3067   nz = header[3];
3068   PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
3069   PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
3070   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as MPIAIJ");
3071 
3072   /* set block sizes from the viewer's .info file */
3073   PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
3074   /* set global sizes if not set already */
3075   if (mat->rmap->N < 0) mat->rmap->N = M;
3076   if (mat->cmap->N < 0) mat->cmap->N = N;
3077   PetscCall(PetscLayoutSetUp(mat->rmap));
3078   PetscCall(PetscLayoutSetUp(mat->cmap));
3079 
3080   /* check if the matrix sizes are correct */
3081   PetscCall(MatGetSize(mat, &rows, &cols));
3082   PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);
3083 
3084   /* read in row lengths and build row indices */
3085   PetscCall(MatGetLocalSize(mat, &m, NULL));
3086   PetscCall(PetscMalloc1(m + 1, &rowidxs));
3087   PetscCall(PetscViewerBinaryReadAll(viewer, rowidxs + 1, m, PETSC_DECIDE, M, PETSC_INT));
3088   rowidxs[0] = 0;
3089   for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3090   if (nz != PETSC_INT_MAX) {
3091     PetscCallMPI(MPIU_Allreduce(&rowidxs[m], &sum, 1, MPIU_INT, MPI_SUM, PetscObjectComm((PetscObject)viewer)));
3092     PetscCheck(sum == nz, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);
3093   }
3094 
3095   /* read in column indices and matrix values */
3096   PetscCall(PetscMalloc2(rowidxs[m], &colidxs, rowidxs[m], &matvals));
3097   PetscCall(PetscViewerBinaryReadAll(viewer, colidxs, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_INT));
3098   PetscCall(PetscViewerBinaryReadAll(viewer, matvals, rowidxs[m], PETSC_DETERMINE, PETSC_DETERMINE, PETSC_SCALAR));
3099   /* store matrix indices and values */
3100   PetscCall(MatMPIAIJSetPreallocationCSR(mat, rowidxs, colidxs, matvals));
3101   PetscCall(PetscFree(rowidxs));
3102   PetscCall(PetscFree2(colidxs, matvals));
3103   PetscFunctionReturn(PETSC_SUCCESS);
3104 }
3105 
3106 /* Not scalable because of ISAllGather() unless getting all columns. */
3107 static PetscErrorCode ISGetSeqIS_Private(Mat mat, IS iscol, IS *isseq)
3108 {
3109   IS          iscol_local;
3110   PetscBool   isstride;
3111   PetscMPIInt gisstride = 0;
3112 
3113   PetscFunctionBegin;
3114   /* check if we are grabbing all columns */
3115   PetscCall(PetscObjectTypeCompare((PetscObject)iscol, ISSTRIDE, &isstride));
3116 
3117   if (isstride) {
3118     PetscInt start, len, mstart, mlen;
3119     PetscCall(ISStrideGetInfo(iscol, &start, NULL));
3120     PetscCall(ISGetLocalSize(iscol, &len));
3121     PetscCall(MatGetOwnershipRangeColumn(mat, &mstart, &mlen));
3122     if (mstart == start && mlen - mstart == len) gisstride = 1;
3123   }
3124 
3125   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &gisstride, 1, MPI_INT, MPI_MIN, PetscObjectComm((PetscObject)mat)));
3126   if (gisstride) {
3127     PetscInt N;
3128     PetscCall(MatGetSize(mat, NULL, &N));
3129     PetscCall(ISCreateStride(PETSC_COMM_SELF, N, 0, 1, &iscol_local));
3130     PetscCall(ISSetIdentity(iscol_local));
3131     PetscCall(PetscInfo(mat, "Optimizing for obtaining all columns of the matrix; skipping ISAllGather()\n"));
3132   } else {
3133     PetscInt cbs;
3134     PetscCall(ISGetBlockSize(iscol, &cbs));
3135     PetscCall(ISAllGather(iscol, &iscol_local));
3136     PetscCall(ISSetBlockSize(iscol_local, cbs));
3137   }
3138 
3139   *isseq = iscol_local;
3140   PetscFunctionReturn(PETSC_SUCCESS);
3141 }
3142 
3143 /*
3144  Used by MatCreateSubMatrix_MPIAIJ_SameRowColDist() to avoid ISAllGather() and a sequential iscol_local
3145  with the global size of iscol (see MatCreateSubMatrix_MPIAIJ_nonscalable())
3146 
3147  Input Parameters:
3148 +   mat - matrix
3149 +   isrow - parallel row index set; its local indices are a subset of the local rows of `mat`,
3150            i.e., mat->rstart <= isrow[i] < mat->rend
3151 -   iscol - parallel column index set; its local indices are a subset of local columns of `mat`,
3152            i.e., mat->cstart <= iscol[i] < mat->cend
3153 
3154  Output Parameters:
3155 +   isrow_d - sequential row index set for retrieving mat->A
3156 .   iscol_d - sequential column index set for retrieving mat->A
3157 .   iscol_o - sequential column index set for retrieving mat->B
3158 -   garray - column map; garray[i] indicates global location of iscol_o[i] in `iscol`
3159  */
3160 static PetscErrorCode ISGetSeqIS_SameColDist_Private(Mat mat, IS isrow, IS iscol, IS *isrow_d, IS *iscol_d, IS *iscol_o, PetscInt *garray[])
3161 {
3162   Vec             x, cmap;
3163   const PetscInt *is_idx;
3164   PetscScalar    *xarray, *cmaparray;
3165   PetscInt        ncols, isstart, *idx, m, rstart, *cmap1, count;
3166   Mat_MPIAIJ     *a    = (Mat_MPIAIJ *)mat->data;
3167   Mat             B    = a->B;
3168   Vec             lvec = a->lvec, lcmap;
3169   PetscInt        i, cstart, cend, Bn = B->cmap->N;
3170   MPI_Comm        comm;
3171   VecScatter      Mvctx = a->Mvctx;
3172 
3173   PetscFunctionBegin;
3174   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3175   PetscCall(ISGetLocalSize(iscol, &ncols));
3176 
3177   /* (1) iscol is a sub-column vector of mat, pad it with '-1.' to form a full vector x */
3178   PetscCall(MatCreateVecs(mat, &x, NULL));
3179   PetscCall(VecSet(x, -1.0));
3180   PetscCall(VecDuplicate(x, &cmap));
3181   PetscCall(VecSet(cmap, -1.0));
3182 
3183   /* Get start indices */
3184   PetscCallMPI(MPI_Scan(&ncols, &isstart, 1, MPIU_INT, MPI_SUM, comm));
3185   isstart -= ncols;
3186   PetscCall(MatGetOwnershipRangeColumn(mat, &cstart, &cend));
3187 
3188   PetscCall(ISGetIndices(iscol, &is_idx));
3189   PetscCall(VecGetArray(x, &xarray));
3190   PetscCall(VecGetArray(cmap, &cmaparray));
3191   PetscCall(PetscMalloc1(ncols, &idx));
3192   for (i = 0; i < ncols; i++) {
3193     xarray[is_idx[i] - cstart]    = (PetscScalar)is_idx[i];
3194     cmaparray[is_idx[i] - cstart] = i + isstart;        /* global index of iscol[i] */
3195     idx[i]                        = is_idx[i] - cstart; /* local index of iscol[i]  */
3196   }
3197   PetscCall(VecRestoreArray(x, &xarray));
3198   PetscCall(VecRestoreArray(cmap, &cmaparray));
3199   PetscCall(ISRestoreIndices(iscol, &is_idx));
3200 
3201   /* Get iscol_d */
3202   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, iscol_d));
3203   PetscCall(ISGetBlockSize(iscol, &i));
3204   PetscCall(ISSetBlockSize(*iscol_d, i));
3205 
3206   /* Get isrow_d */
3207   PetscCall(ISGetLocalSize(isrow, &m));
3208   rstart = mat->rmap->rstart;
3209   PetscCall(PetscMalloc1(m, &idx));
3210   PetscCall(ISGetIndices(isrow, &is_idx));
3211   for (i = 0; i < m; i++) idx[i] = is_idx[i] - rstart;
3212   PetscCall(ISRestoreIndices(isrow, &is_idx));
3213 
3214   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, m, idx, PETSC_OWN_POINTER, isrow_d));
3215   PetscCall(ISGetBlockSize(isrow, &i));
3216   PetscCall(ISSetBlockSize(*isrow_d, i));
3217 
3218   /* (2) Scatter x and cmap using aij->Mvctx to get their off-process portions (see MatMult_MPIAIJ) */
3219   PetscCall(VecScatterBegin(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3220   PetscCall(VecScatterEnd(Mvctx, x, lvec, INSERT_VALUES, SCATTER_FORWARD));
3221 
3222   PetscCall(VecDuplicate(lvec, &lcmap));
3223 
3224   PetscCall(VecScatterBegin(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3225   PetscCall(VecScatterEnd(Mvctx, cmap, lcmap, INSERT_VALUES, SCATTER_FORWARD));
3226 
3227   /* (3) create sequential iscol_o (a subset of iscol) and garray */
3228   /* off-process column indices */
3229   count = 0;
3230   PetscCall(PetscMalloc1(Bn, &idx));
3231   PetscCall(PetscMalloc1(Bn, &cmap1));
3232 
3233   PetscCall(VecGetArray(lvec, &xarray));
3234   PetscCall(VecGetArray(lcmap, &cmaparray));
3235   for (i = 0; i < Bn; i++) {
3236     if (PetscRealPart(xarray[i]) > -1.0) {
3237       idx[count]   = i;                                     /* local column index in off-diagonal part B */
3238       cmap1[count] = (PetscInt)PetscRealPart(cmaparray[i]); /* column index in submat */
3239       count++;
3240     }
3241   }
3242   PetscCall(VecRestoreArray(lvec, &xarray));
3243   PetscCall(VecRestoreArray(lcmap, &cmaparray));
3244 
3245   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_COPY_VALUES, iscol_o));
3246   /* cannot ensure iscol_o has same blocksize as iscol! */
3247 
3248   PetscCall(PetscFree(idx));
3249   *garray = cmap1;
3250 
3251   PetscCall(VecDestroy(&x));
3252   PetscCall(VecDestroy(&cmap));
3253   PetscCall(VecDestroy(&lcmap));
3254   PetscFunctionReturn(PETSC_SUCCESS);
3255 }
3256 
3257 /* isrow and iscol have same processor distribution as mat, output *submat is a submatrix of local mat */
3258 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowColDist(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *submat)
3259 {
3260   Mat_MPIAIJ *a = (Mat_MPIAIJ *)mat->data, *asub;
3261   Mat         M = NULL;
3262   MPI_Comm    comm;
3263   IS          iscol_d, isrow_d, iscol_o;
3264   Mat         Asub = NULL, Bsub = NULL;
3265   PetscInt    n, count, M_size, N_size;
3266 
3267   PetscFunctionBegin;
3268   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3269 
3270   if (call == MAT_REUSE_MATRIX) {
3271     /* Retrieve isrow_d, iscol_d and iscol_o from submat */
3272     PetscCall(PetscObjectQuery((PetscObject)*submat, "isrow_d", (PetscObject *)&isrow_d));
3273     PetscCheck(isrow_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "isrow_d passed in was not used before, cannot reuse");
3274 
3275     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_d", (PetscObject *)&iscol_d));
3276     PetscCheck(iscol_d, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_d passed in was not used before, cannot reuse");
3277 
3278     PetscCall(PetscObjectQuery((PetscObject)*submat, "iscol_o", (PetscObject *)&iscol_o));
3279     PetscCheck(iscol_o, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "iscol_o passed in was not used before, cannot reuse");
3280 
3281     /* Update diagonal and off-diagonal portions of submat */
3282     asub = (Mat_MPIAIJ *)(*submat)->data;
3283     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->A));
3284     PetscCall(ISGetLocalSize(iscol_o, &n));
3285     if (n) PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_REUSE_MATRIX, &asub->B));
3286     PetscCall(MatAssemblyBegin(*submat, MAT_FINAL_ASSEMBLY));
3287     PetscCall(MatAssemblyEnd(*submat, MAT_FINAL_ASSEMBLY));
3288 
3289   } else { /* call == MAT_INITIAL_MATRIX) */
3290     PetscInt *garray, *garray_compact;
3291     PetscInt  BsubN;
3292 
3293     /* Create isrow_d, iscol_d, iscol_o and isgarray (replace isgarray with array?) */
3294     PetscCall(ISGetSeqIS_SameColDist_Private(mat, isrow, iscol, &isrow_d, &iscol_d, &iscol_o, &garray));
3295 
3296     /* Create local submatrices Asub and Bsub */
3297     PetscCall(MatCreateSubMatrix_SeqAIJ(a->A, isrow_d, iscol_d, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Asub));
3298     PetscCall(MatCreateSubMatrix_SeqAIJ(a->B, isrow_d, iscol_o, PETSC_DECIDE, MAT_INITIAL_MATRIX, &Bsub));
3299 
3300     // Compact garray so that it is not of size Bn
3301     PetscCall(ISGetSize(iscol_o, &count));
3302     PetscCall(PetscMalloc1(count, &garray_compact));
3303     PetscCall(PetscArraycpy(garray_compact, garray, count));
3304 
3305     /* Create submatrix M */
3306     PetscCall(ISGetSize(isrow, &M_size));
3307     PetscCall(ISGetSize(iscol, &N_size));
3308     PetscCall(MatCreateMPIAIJWithSeqAIJ(comm, M_size, N_size, Asub, Bsub, garray_compact, &M));
3309 
3310     /* If Bsub has empty columns, compress iscol_o such that it will retrieve condensed Bsub from a->B during reuse */
3311     asub = (Mat_MPIAIJ *)M->data;
3312 
3313     PetscCall(ISGetLocalSize(iscol_o, &BsubN));
3314     n = asub->B->cmap->N;
3315     if (BsubN > n) {
3316       /* This case can be tested using ~petsc/src/tao/bound/tutorials/runplate2_3 */
3317       const PetscInt *idx;
3318       PetscInt        i, j, *idx_new, *subgarray = asub->garray;
3319       PetscCall(PetscInfo(M, "submatrix Bn %" PetscInt_FMT " != BsubN %" PetscInt_FMT ", update iscol_o\n", n, BsubN));
3320 
3321       PetscCall(PetscMalloc1(n, &idx_new));
3322       j = 0;
3323       PetscCall(ISGetIndices(iscol_o, &idx));
3324       for (i = 0; i < n; i++) {
3325         if (j >= BsubN) break;
3326         while (subgarray[i] > garray[j]) j++;
3327 
3328         if (subgarray[i] == garray[j]) {
3329           idx_new[i] = idx[j++];
3330         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "subgarray[%" PetscInt_FMT "]=%" PetscInt_FMT " cannot be smaller than garray[%" PetscInt_FMT "]=%" PetscInt_FMT, i, subgarray[i], j, garray[j]);
3331       }
3332       PetscCall(ISRestoreIndices(iscol_o, &idx));
3333 
3334       PetscCall(ISDestroy(&iscol_o));
3335       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, idx_new, PETSC_OWN_POINTER, &iscol_o));
3336 
3337     } else if (BsubN < n) {
3338       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Columns of Bsub (%" PetscInt_FMT ") cannot be smaller than B's (%" PetscInt_FMT ")", BsubN, asub->B->cmap->N);
3339     }
3340 
3341     PetscCall(PetscFree(garray));
3342     *submat = M;
3343 
3344     /* Save isrow_d, iscol_d and iscol_o used in processor for next request */
3345     PetscCall(PetscObjectCompose((PetscObject)M, "isrow_d", (PetscObject)isrow_d));
3346     PetscCall(ISDestroy(&isrow_d));
3347 
3348     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_d", (PetscObject)iscol_d));
3349     PetscCall(ISDestroy(&iscol_d));
3350 
3351     PetscCall(PetscObjectCompose((PetscObject)M, "iscol_o", (PetscObject)iscol_o));
3352     PetscCall(ISDestroy(&iscol_o));
3353   }
3354   PetscFunctionReturn(PETSC_SUCCESS);
3355 }
3356 
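/*
   Driver for MatCreateSubMatrix(): dispatches to MatCreateSubMatrix_MPIAIJ_SameRowColDist() when
   isrow and iscol both follow mat's row/column ownership, to MatCreateSubMatrix_MPIAIJ_SameRowDist()
   when only isrow does (and the gathered column IS is sorted), and otherwise falls back to the
   general MatCreateSubMatrix_MPIAIJ_nonscalable() with an iscol_local of global size
*/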
3357 PetscErrorCode MatCreateSubMatrix_MPIAIJ(Mat mat, IS isrow, IS iscol, MatReuse call, Mat *newmat)
3358 {
3359   IS        iscol_local = NULL, isrow_d;
3360   PetscInt  csize;
3361   PetscInt  n, i, j, start, end;
3362   PetscBool sameRowDist = PETSC_FALSE, sameDist[2], tsameDist[2];
3363   MPI_Comm  comm;
3364 
3365   PetscFunctionBegin;
3366   /* If isrow has the same processor distribution as mat,
3367      call MatCreateSubMatrix_MPIAIJ_SameRowDist() to avoid a hash table with the global size of iscol */
3368   if (call == MAT_REUSE_MATRIX) {
3369     PetscCall(PetscObjectQuery((PetscObject)*newmat, "isrow_d", (PetscObject *)&isrow_d));
3370     if (isrow_d) {
3371       sameRowDist  = PETSC_TRUE;
3372       tsameDist[1] = PETSC_TRUE; /* sameColDist */
3373     } else {
3374       PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_local));
3375       if (iscol_local) {
3376         sameRowDist  = PETSC_TRUE;
3377         tsameDist[1] = PETSC_FALSE; /* !sameColDist */
3378       }
3379     }
3380   } else {
3381     /* Check if isrow has same processor distribution as mat */
3382     sameDist[0] = PETSC_FALSE;
3383     PetscCall(ISGetLocalSize(isrow, &n));
3384     if (!n) {
3385       sameDist[0] = PETSC_TRUE;
3386     } else {
3387       PetscCall(ISGetMinMax(isrow, &i, &j));
3388       PetscCall(MatGetOwnershipRange(mat, &start, &end));
3389       if (i >= start && j < end) sameDist[0] = PETSC_TRUE;
3390     }
3391 
3392     /* Check if iscol has same processor distribution as mat */
3393     sameDist[1] = PETSC_FALSE;
3394     PetscCall(ISGetLocalSize(iscol, &n));
3395     if (!n) {
3396       sameDist[1] = PETSC_TRUE;
3397     } else {
3398       PetscCall(ISGetMinMax(iscol, &i, &j));
3399       PetscCall(MatGetOwnershipRangeColumn(mat, &start, &end));
3400       if (i >= start && j < end) sameDist[1] = PETSC_TRUE;
3401     }
3402 
3403     PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3404     PetscCallMPI(MPIU_Allreduce(&sameDist, &tsameDist, 2, MPIU_BOOL, MPI_LAND, comm));
3405     sameRowDist = tsameDist[0];
3406   }
3407 
3408   if (sameRowDist) {
3409     if (tsameDist[1]) { /* sameRowDist & sameColDist */
3410       /* isrow and iscol have same processor distribution as mat */
3411       PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowColDist(mat, isrow, iscol, call, newmat));
3412       PetscFunctionReturn(PETSC_SUCCESS);
3413     } else { /* sameRowDist */
3414       /* isrow has same processor distribution as mat */
3415       if (call == MAT_INITIAL_MATRIX) {
3416         PetscBool sorted;
3417         PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3418         PetscCall(ISGetLocalSize(iscol_local, &n)); /* local size of iscol_local = global columns of newmat */
3419         PetscCall(ISGetSize(iscol, &i));
3420         PetscCheck(n == i, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "n %" PetscInt_FMT " != size of iscol %" PetscInt_FMT, n, i);
3421 
3422         PetscCall(ISSorted(iscol_local, &sorted));
3423         if (sorted) {
3424           /* MatCreateSubMatrix_MPIAIJ_SameRowDist() requires iscol_local be sorted; it can have duplicate indices */
3425           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, iscol_local, MAT_INITIAL_MATRIX, newmat));
3426           PetscFunctionReturn(PETSC_SUCCESS);
3427         }
3428       } else { /* call == MAT_REUSE_MATRIX */
3429         IS iscol_sub;
3430         PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3431         if (iscol_sub) {
3432           PetscCall(MatCreateSubMatrix_MPIAIJ_SameRowDist(mat, isrow, iscol, NULL, call, newmat));
3433           PetscFunctionReturn(PETSC_SUCCESS);
3434         }
3435       }
3436     }
3437   }
3438 
3439   /* General case: iscol -> iscol_local which has global size of iscol */
3440   if (call == MAT_REUSE_MATRIX) {
3441     PetscCall(PetscObjectQuery((PetscObject)*newmat, "ISAllGather", (PetscObject *)&iscol_local));
3442     PetscCheck(iscol_local, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3443   } else {
3444     if (!iscol_local) PetscCall(ISGetSeqIS_Private(mat, iscol, &iscol_local));
3445   }
3446 
3447   PetscCall(ISGetLocalSize(iscol, &csize));
3448   PetscCall(MatCreateSubMatrix_MPIAIJ_nonscalable(mat, isrow, iscol_local, csize, call, newmat));
3449 
3450   if (call == MAT_INITIAL_MATRIX) {
3451     PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3452     PetscCall(ISDestroy(&iscol_local));
3453   }
3454   PetscFunctionReturn(PETSC_SUCCESS);
3455 }
3456 
3457 /*@C
3458   MatCreateMPIAIJWithSeqAIJ - creates a `MATMPIAIJ` matrix using `MATSEQAIJ` matrices that contain the "diagonal"
3459   and "off-diagonal" part of the matrix in CSR format.
3460 
3461   Collective
3462 
3463   Input Parameters:
3464 + comm   - MPI communicator
3465 . M      - the global row size
3466 . N      - the global column size
3467 . A      - "diagonal" portion of matrix
3468 . B      - the "off-diagonal" portion of the matrix; if `garray` is `NULL`, `B` uses global column ids and has N columns, otherwise `B` uses local column ids and has as many columns as `garray` has entries
3469 - garray - either `NULL` or the global index of `B` columns. If not `NULL`, it should be allocated by `PetscMalloc1()` and will be owned by `mat` thereafter.
3470 
3471   Output Parameter:
3472 . mat - the matrix, with input `A` as its local diagonal matrix
3473 
3474   Level: advanced
3475 
3476   Notes:
3477   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix.
3478 
3479   `A` and `B` become part of the output `mat` and may no longer be used by the caller.
3480 
3481   If `garray` is `NULL`, `B` will be compacted to use local indices, so `B`'s sparsity pattern (nonzerostate) changes. If `B` is a device matrix, its copy on device must also be
3482   updated. We do so by increasing `B`'s nonzerostate; when `B` is next used on device, device matrix types detect this change (ref. internal routines `MatSeqAIJCUSPARSECopyToGPU()` and
3483   `MatAssemblyEnd_SeqAIJKokkos()`) and simply destroy and then recreate the device copy of `B`. This is not optimal, but it is easy to implement and less hacky. To avoid this overhead,
3484   compute `garray` yourself; see the algorithms in the private function `MatSetUpMultiply_MPIAIJ()`.
3485 
3486   The `NULL`-ness of `garray` need not be collective; that is, `garray` can be `NULL` on some processes and non-`NULL` on others.
3487 
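  Example Usage:
  A minimal sketch, assuming `Ad` and `Ao` are `MATSEQAIJ` matrices built by the caller, with `Ao`
  using global column ids (hence `garray` is passed as `NULL` and `Ao` is compacted internally)
.vb
  Mat C;

  PetscCall(MatCreateMPIAIJWithSeqAIJ(PETSC_COMM_WORLD, M, N, Ad, Ao, NULL, &C));
  /* Ad and Ao are now owned by C; the caller must not use or destroy them */
.ve
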
3488 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MATSEQAIJ`, `MatCreateMPIAIJWithSplitArrays()`
3489 @*/
3490 PetscErrorCode MatCreateMPIAIJWithSeqAIJ(MPI_Comm comm, PetscInt M, PetscInt N, Mat A, Mat B, PetscInt *garray, Mat *mat)
3491 {
3492   PetscInt    m, n;
3493   MatType     mpi_mat_type;
3494   Mat_MPIAIJ *mpiaij;
3495   Mat         C;
3496 
3497   PetscFunctionBegin;
3498   PetscCall(MatCreate(comm, &C));
3499   PetscCall(MatGetSize(A, &m, &n));
3500   PetscCheck(m == B->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Am %" PetscInt_FMT " != Bm %" PetscInt_FMT, m, B->rmap->N);
3501   PetscCheck(A->rmap->bs == B->rmap->bs, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "A row bs %" PetscInt_FMT " != B row bs %" PetscInt_FMT, A->rmap->bs, B->rmap->bs);
3502 
3503   PetscCall(MatSetSizes(C, m, n, M, N));
3504   /* Determine the type of MPI matrix that should be created from the type of matrix A, which holds the "diagonal" portion. */
3505   PetscCall(MatGetMPIMatType_Private(A, &mpi_mat_type));
3506   PetscCall(MatSetType(C, mpi_mat_type));
3507   if (!garray) {
3508     const PetscScalar *ba;
3509 
3510     B->nonzerostate++;
3511     PetscCall(MatSeqAIJGetArrayRead(B, &ba)); /* Since we will destroy B's device copy, we need to make sure the host copy is up to date */
3512     PetscCall(MatSeqAIJRestoreArrayRead(B, &ba));
3513   }
3514 
3515   PetscCall(MatSetBlockSizes(C, A->rmap->bs, A->cmap->bs));
3516   PetscCall(PetscLayoutSetUp(C->rmap));
3517   PetscCall(PetscLayoutSetUp(C->cmap));
3518 
3519   mpiaij              = (Mat_MPIAIJ *)C->data;
3520   mpiaij->A           = A;
3521   mpiaij->B           = B;
3522   mpiaij->garray      = garray;
3523   C->preallocated     = PETSC_TRUE;
3524   C->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */
3525 
3526   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
3527   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
3528   /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to those of A and B, and
3529    also compacts mpiaij->B (if garray is NULL), reducing its column ids and size
3530    */
3531   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
3532   PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
3533   PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3534   *mat = C;
3535   PetscFunctionReturn(PETSC_SUCCESS);
3536 }
3537 
3538 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat, PetscInt, const IS[], const IS[], MatReuse, PetscBool, Mat *);
3539 
3540 PetscErrorCode MatCreateSubMatrix_MPIAIJ_SameRowDist(Mat mat, IS isrow, IS iscol, IS iscol_local, MatReuse call, Mat *newmat)
3541 {
3542   PetscInt        i, m, n, rstart, row, rend, nz, j, bs, cbs;
3543   PetscInt       *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3544   Mat_MPIAIJ     *a = (Mat_MPIAIJ *)mat->data;
3545   Mat             M, Msub, B = a->B;
3546   MatScalar      *aa;
3547   Mat_SeqAIJ     *aij;
3548   PetscInt       *garray = a->garray, *colsub, Ncols;
3549   PetscInt        count, Bn = B->cmap->N, cstart = mat->cmap->rstart, cend = mat->cmap->rend;
3550   IS              iscol_sub, iscmap;
3551   const PetscInt *is_idx, *cmap;
3552   PetscBool       allcolumns = PETSC_FALSE;
3553   MPI_Comm        comm;
3554 
3555   PetscFunctionBegin;
3556   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3557   if (call == MAT_REUSE_MATRIX) {
3558     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubIScol", (PetscObject *)&iscol_sub));
3559     PetscCheck(iscol_sub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "SubIScol passed in was not used before, cannot reuse");
3560     PetscCall(ISGetLocalSize(iscol_sub, &count));
3561 
3562     PetscCall(PetscObjectQuery((PetscObject)*newmat, "Subcmap", (PetscObject *)&iscmap));
3563     PetscCheck(iscmap, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Subcmap passed in was not used before, cannot reuse");
3564 
3565     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Msub));
3566     PetscCheck(Msub, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3567 
3568     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_REUSE_MATRIX, PETSC_FALSE, &Msub));
3569 
3570   } else { /* call == MAT_INITIAL_MATRIX */
3571     PetscBool flg;
3572 
3573     PetscCall(ISGetLocalSize(iscol, &n));
3574     PetscCall(ISGetSize(iscol, &Ncols));
3575 
3576     /* (1) iscol -> nonscalable iscol_local */
3577     /* Check for special case: each processor gets entire matrix columns */
3578     PetscCall(ISIdentity(iscol_local, &flg));
3579     if (flg && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3580     PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3581     if (allcolumns) {
3582       iscol_sub = iscol_local;
3583       PetscCall(PetscObjectReference((PetscObject)iscol_local));
3584       PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iscmap));
3585 
3586     } else {
3587       /* (2) iscol_local -> iscol_sub and iscmap. Implementation below requires iscol_local be sorted, it can have duplicate indices */
3588       PetscInt *idx, *cmap1, k;
3589       PetscCall(PetscMalloc1(Ncols, &idx));
3590       PetscCall(PetscMalloc1(Ncols, &cmap1));
3591       PetscCall(ISGetIndices(iscol_local, &is_idx));
3592       count = 0;
3593       k     = 0;
3594       for (i = 0; i < Ncols; i++) {
3595         j = is_idx[i];
3596         if (j >= cstart && j < cend) {
3597           /* diagonal part of mat */
3598           idx[count]     = j;
3599           cmap1[count++] = i; /* column index in submat */
3600         } else if (Bn) {
3601           /* off-diagonal part of mat */
3602           if (j == garray[k]) {
3603             idx[count]     = j;
3604             cmap1[count++] = i; /* column index in submat */
3605           } else if (j > garray[k]) {
3606             while (j > garray[k] && k < Bn - 1) k++;
3607             if (j == garray[k]) {
3608               idx[count]     = j;
3609               cmap1[count++] = i; /* column index in submat */
3610             }
3611           }
3612         }
3613       }
3614       PetscCall(ISRestoreIndices(iscol_local, &is_idx));
3615 
3616       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, count, idx, PETSC_OWN_POINTER, &iscol_sub));
3617       PetscCall(ISGetBlockSize(iscol, &cbs));
3618       PetscCall(ISSetBlockSize(iscol_sub, cbs));
3619 
3620       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)iscol_local), count, cmap1, PETSC_OWN_POINTER, &iscmap));
3621     }
3622 
3623     /* (3) Create sequential Msub */
3624     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol_sub, MAT_INITIAL_MATRIX, allcolumns, &Msub));
3625   }
3626 
3627   PetscCall(ISGetLocalSize(iscol_sub, &count));
3628   aij = (Mat_SeqAIJ *)Msub->data;
3629   ii  = aij->i;
3630   PetscCall(ISGetIndices(iscmap, &cmap));
3631 
3632   /*
3633       m - number of local rows
3634       Ncols - number of columns (same on all processors)
3635       rstart - first row in new global matrix generated
3636   */
3637   PetscCall(MatGetSize(Msub, &m, NULL));
3638 
3639   if (call == MAT_INITIAL_MATRIX) {
3640     /* (4) Create parallel newmat */
3641     PetscMPIInt rank, size;
3642     PetscInt    csize;
3643 
3644     PetscCallMPI(MPI_Comm_size(comm, &size));
3645     PetscCallMPI(MPI_Comm_rank(comm, &rank));
3646 
3647     /*
3648         Determine the number of non-zeros in the diagonal and off-diagonal
3649         portions of the matrix in order to do correct preallocation
3650     */
3651 
3652     /* first get start and end of "diagonal" columns */
3653     PetscCall(ISGetLocalSize(iscol, &csize));
3654     if (csize == PETSC_DECIDE) {
3655       PetscCall(ISGetSize(isrow, &mglobal));
3656       if (mglobal == Ncols) { /* square matrix */
3657         nlocal = m;
3658       } else {
3659         nlocal = Ncols / size + ((Ncols % size) > rank);
3660       }
3661     } else {
3662       nlocal = csize;
3663     }
3664     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3665     rstart = rend - nlocal;
3666     PetscCheck(rank != size - 1 || rend == Ncols, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, Ncols);
3667 
3668     /* next, compute all the lengths */
3669     jj = aij->j;
3670     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3671     olens = dlens + m;
3672     for (i = 0; i < m; i++) {
3673       jend = ii[i + 1] - ii[i];
3674       olen = 0;
3675       dlen = 0;
3676       for (j = 0; j < jend; j++) {
3677         if (cmap[*jj] < rstart || cmap[*jj] >= rend) olen++;
3678         else dlen++;
3679         jj++;
3680       }
3681       olens[i] = olen;
3682       dlens[i] = dlen;
3683     }
3684 
3685     PetscCall(ISGetBlockSize(isrow, &bs));
3686     PetscCall(ISGetBlockSize(iscol, &cbs));
3687 
3688     PetscCall(MatCreate(comm, &M));
3689     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, Ncols));
3690     PetscCall(MatSetBlockSizes(M, bs, cbs));
3691     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3692     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3693     PetscCall(PetscFree(dlens));
3694 
3695   } else { /* call == MAT_REUSE_MATRIX */
3696     M = *newmat;
3697     PetscCall(MatGetLocalSize(M, &i, NULL));
3698     PetscCheck(i == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3699     PetscCall(MatZeroEntries(M));
3700     /*
3701          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3702        rather than the slower MatSetValues().
3703     */
3704     M->was_assembled = PETSC_TRUE;
3705     M->assembled     = PETSC_FALSE;
3706   }
3707 
3708   /* (5) Set values of Msub to *newmat */
3709   PetscCall(PetscMalloc1(count, &colsub));
3710   PetscCall(MatGetOwnershipRange(M, &rstart, NULL));
3711 
3712   jj = aij->j;
3713   PetscCall(MatSeqAIJGetArrayRead(Msub, (const PetscScalar **)&aa));
3714   for (i = 0; i < m; i++) {
3715     row = rstart + i;
3716     nz  = ii[i + 1] - ii[i];
3717     for (j = 0; j < nz; j++) colsub[j] = cmap[jj[j]];
3718     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, colsub, aa, INSERT_VALUES));
3719     jj += nz;
3720     aa += nz;
3721   }
3722   PetscCall(MatSeqAIJRestoreArrayRead(Msub, (const PetscScalar **)&aa));
3723   PetscCall(ISRestoreIndices(iscmap, &cmap));
3724 
3725   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3726   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3727 
3728   PetscCall(PetscFree(colsub));
3729 
3730   /* save Msub, iscol_sub and iscmap used in processor for next request */
3731   if (call == MAT_INITIAL_MATRIX) {
3732     *newmat = M;
3733     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubMatrix", (PetscObject)Msub));
3734     PetscCall(MatDestroy(&Msub));
3735 
3736     PetscCall(PetscObjectCompose((PetscObject)*newmat, "SubIScol", (PetscObject)iscol_sub));
3737     PetscCall(ISDestroy(&iscol_sub));
3738 
3739     PetscCall(PetscObjectCompose((PetscObject)*newmat, "Subcmap", (PetscObject)iscmap));
3740     PetscCall(ISDestroy(&iscmap));
3741 
3742     if (iscol_local) {
3743       PetscCall(PetscObjectCompose((PetscObject)*newmat, "ISAllGather", (PetscObject)iscol_local));
3744       PetscCall(ISDestroy(&iscol_local));
3745     }
3746   }
3747   PetscFunctionReturn(PETSC_SUCCESS);
3748 }
3749 
3750 /*
3751     Not great since it makes two copies of the submatrix: first a SeqAIJ on each process,
3752   and then the final result by concatenating the local pieces. Writing it directly
3753   would look much like MatCreateSubMatrices_MPIAIJ().
3754 
3755   This requires a sequential iscol with all indices.
3756 */
3757 PetscErrorCode MatCreateSubMatrix_MPIAIJ_nonscalable(Mat mat, IS isrow, IS iscol, PetscInt csize, MatReuse call, Mat *newmat)
3758 {
3759   PetscMPIInt rank, size;
3760   PetscInt    i, m, n, rstart, row, rend, nz, *cwork, j, bs, cbs;
3761   PetscInt   *ii, *jj, nlocal, *dlens, *olens, dlen, olen, jend, mglobal;
3762   Mat         M, Mreuse;
3763   MatScalar  *aa, *vwork;
3764   MPI_Comm    comm;
3765   Mat_SeqAIJ *aij;
3766   PetscBool   colflag, allcolumns = PETSC_FALSE;
3767 
3768   PetscFunctionBegin;
3769   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
3770   PetscCallMPI(MPI_Comm_rank(comm, &rank));
3771   PetscCallMPI(MPI_Comm_size(comm, &size));
3772 
3773   /* Check for special case: each processor gets entire matrix columns */
3774   PetscCall(ISIdentity(iscol, &colflag));
3775   PetscCall(ISGetLocalSize(iscol, &n));
3776   if (colflag && n == mat->cmap->N) allcolumns = PETSC_TRUE;
3777   PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &allcolumns, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
3778 
3779   if (call == MAT_REUSE_MATRIX) {
3780     PetscCall(PetscObjectQuery((PetscObject)*newmat, "SubMatrix", (PetscObject *)&Mreuse));
3781     PetscCheck(Mreuse, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Submatrix passed in was not used before, cannot reuse");
3782     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_REUSE_MATRIX, allcolumns, &Mreuse));
3783   } else {
3784     PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(mat, 1, &isrow, &iscol, MAT_INITIAL_MATRIX, allcolumns, &Mreuse));
3785   }
3786 
3787   /*
3788       m - number of local rows
3789       n - number of columns (same on all processors)
3790       rstart - first row in new global matrix generated
3791   */
3792   PetscCall(MatGetSize(Mreuse, &m, &n));
3793   PetscCall(MatGetBlockSizes(Mreuse, &bs, &cbs));
3794   if (call == MAT_INITIAL_MATRIX) {
3795     aij = (Mat_SeqAIJ *)Mreuse->data;
3796     ii  = aij->i;
3797     jj  = aij->j;
3798 
3799     /*
3800         Determine the number of non-zeros in the diagonal and off-diagonal
3801         portions of the matrix in order to do correct preallocation
3802     */
3803 
3804     /* first get start and end of "diagonal" columns */
3805     if (csize == PETSC_DECIDE) {
3806       PetscCall(ISGetSize(isrow, &mglobal));
3807       if (mglobal == n) { /* square matrix */
3808         nlocal = m;
3809       } else {
3810         nlocal = n / size + ((n % size) > rank);
3811       }
3812     } else {
3813       nlocal = csize;
3814     }
3815     PetscCallMPI(MPI_Scan(&nlocal, &rend, 1, MPIU_INT, MPI_SUM, comm));
3816     rstart = rend - nlocal;
3817     PetscCheck(rank != size - 1 || rend == n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Local column sizes %" PetscInt_FMT " do not add up to total number of columns %" PetscInt_FMT, rend, n);
3818 
3819     /* next, compute all the lengths */
3820     PetscCall(PetscMalloc1(2 * m + 1, &dlens));
3821     olens = dlens + m;
3822     for (i = 0; i < m; i++) {
3823       jend = ii[i + 1] - ii[i];
3824       olen = 0;
3825       dlen = 0;
3826       for (j = 0; j < jend; j++) {
3827         if (*jj < rstart || *jj >= rend) olen++;
3828         else dlen++;
3829         jj++;
3830       }
3831       olens[i] = olen;
3832       dlens[i] = dlen;
3833     }
3834     PetscCall(MatCreate(comm, &M));
3835     PetscCall(MatSetSizes(M, m, nlocal, PETSC_DECIDE, n));
3836     PetscCall(MatSetBlockSizes(M, bs, cbs));
3837     PetscCall(MatSetType(M, ((PetscObject)mat)->type_name));
3838     PetscCall(MatMPIAIJSetPreallocation(M, 0, dlens, 0, olens));
3839     PetscCall(PetscFree(dlens));
3840   } else {
3841     PetscInt ml, nl;
3842 
3843     M = *newmat;
3844     PetscCall(MatGetLocalSize(M, &ml, &nl));
3845     PetscCheck(ml == m, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Previous matrix must be same size/layout as request");
3846     PetscCall(MatZeroEntries(M));
3847     /*
3848          The next two lines are needed so we may call MatSetValues_MPIAIJ() below directly,
3849        rather than the slower MatSetValues().
3850     */
3851     M->was_assembled = PETSC_TRUE;
3852     M->assembled     = PETSC_FALSE;
3853   }
3854   PetscCall(MatGetOwnershipRange(M, &rstart, &rend));
3855   aij = (Mat_SeqAIJ *)Mreuse->data;
3856   ii  = aij->i;
3857   jj  = aij->j;
3858 
3859   /* trigger copy to CPU if needed */
3860   PetscCall(MatSeqAIJGetArrayRead(Mreuse, (const PetscScalar **)&aa));
3861   for (i = 0; i < m; i++) {
3862     row   = rstart + i;
3863     nz    = ii[i + 1] - ii[i];
3864     cwork = jj;
3865     jj    = PetscSafePointerPlusOffset(jj, nz);
3866     vwork = aa;
3867     aa    = PetscSafePointerPlusOffset(aa, nz);
3868     PetscCall(MatSetValues_MPIAIJ(M, 1, &row, nz, cwork, vwork, INSERT_VALUES));
3869   }
3870   PetscCall(MatSeqAIJRestoreArrayRead(Mreuse, (const PetscScalar **)&aa));
3871 
3872   PetscCall(MatAssemblyBegin(M, MAT_FINAL_ASSEMBLY));
3873   PetscCall(MatAssemblyEnd(M, MAT_FINAL_ASSEMBLY));
3874   *newmat = M;
3875 
3876   /* save submatrix used in processor for next request */
3877   if (call == MAT_INITIAL_MATRIX) {
3878     PetscCall(PetscObjectCompose((PetscObject)M, "SubMatrix", (PetscObject)Mreuse));
3879     PetscCall(MatDestroy(&Mreuse));
3880   }
3881   PetscFunctionReturn(PETSC_SUCCESS);
3882 }
3883 
3884 static PetscErrorCode MatMPIAIJSetPreallocationCSR_MPIAIJ(Mat B, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
3885 {
3886   PetscInt        m, cstart, cend, j, nnz, i, d, *ld;
3887   PetscInt       *d_nnz, *o_nnz, nnz_max = 0, rstart, ii, irstart;
3888   const PetscInt *JJ;
3889   PetscBool       nooffprocentries;
3890   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)B->data;
3891 
3892   PetscFunctionBegin;
3893   PetscCall(PetscLayoutSetUp(B->rmap));
3894   PetscCall(PetscLayoutSetUp(B->cmap));
3895   m       = B->rmap->n;
3896   cstart  = B->cmap->rstart;
3897   cend    = B->cmap->rend;
3898   rstart  = B->rmap->rstart;
3899   irstart = Ii[0];
3900 
3901   PetscCall(PetscCalloc2(m, &d_nnz, m, &o_nnz));
3902 
3903   if (PetscDefined(USE_DEBUG)) {
3904     for (i = 0; i < m; i++) {
3905       nnz = Ii[i + 1] - Ii[i];
3906       JJ  = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3907       PetscCheck(nnz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative %" PetscInt_FMT " number of columns", i, nnz);
3908       PetscCheck(!nnz || !(JJ[0] < 0), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " starts with negative column index %" PetscInt_FMT, i, JJ[0]);
3909       PetscCheck(!nnz || !(JJ[nnz - 1] >= B->cmap->N), PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Row %" PetscInt_FMT " ends with too large a column index %" PetscInt_FMT " (max allowed %" PetscInt_FMT ")", i, JJ[nnz - 1], B->cmap->N);
3910     }
3911   }
3912 
3913   for (i = 0; i < m; i++) {
3914     nnz     = Ii[i + 1] - Ii[i];
3915     JJ      = PetscSafePointerPlusOffset(J, Ii[i] - irstart);
3916     nnz_max = PetscMax(nnz_max, nnz);
3917     d       = 0;
3918     for (j = 0; j < nnz; j++) {
3919       if (cstart <= JJ[j] && JJ[j] < cend) d++;
3920     }
3921     d_nnz[i] = d;
3922     o_nnz[i] = nnz - d;
3923   }
3924   PetscCall(MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz));
3925   PetscCall(PetscFree2(d_nnz, o_nnz));
3926 
3927   for (i = 0; i < m; i++) {
3928     ii = i + rstart;
3929     PetscCall(MatSetValues_MPIAIJ(B, 1, &ii, Ii[i + 1] - Ii[i], PetscSafePointerPlusOffset(J, Ii[i] - irstart), PetscSafePointerPlusOffset(v, Ii[i] - irstart), INSERT_VALUES));
3930   }
3931   nooffprocentries    = B->nooffprocentries;
3932   B->nooffprocentries = PETSC_TRUE;
3933   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
3934   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
3935   B->nooffprocentries = nooffprocentries;
3936 
3937   /* count number of entries below block diagonal */
3938   PetscCall(PetscFree(Aij->ld));
3939   PetscCall(PetscCalloc1(m, &ld));
3940   Aij->ld = ld;
3941   for (i = 0; i < m; i++) {
3942     nnz = Ii[i + 1] - Ii[i];
3943     j   = 0;
3944     while (j < nnz && J[j] < cstart) j++;
3945     ld[i] = j;
3946     if (J) J += nnz;
3947   }
3948 
3949   PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3950   PetscFunctionReturn(PETSC_SUCCESS);
3951 }
3952 
3953 /*@
3954   MatMPIAIJSetPreallocationCSR - Allocates memory for a sparse parallel matrix in `MATAIJ` format
3955   (the default parallel PETSc format).
3956 
3957   Collective
3958 
3959   Input Parameters:
3960 + B - the matrix
3961 . i - the indices into `j` for the start of each local row (indices start with zero)
3962 . j - the column indices for each local row (indices start with zero)
3963 - v - optional values in the matrix
3964 
3965   Level: developer
3966 
3967   Notes:
3968   The `i`, `j`, and `v` arrays ARE copied by this routine into the internal format used by PETSc;
3969   thus you CANNOT change the matrix entries by changing the values of `v` after you have
3970   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
3971 
3972   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
3973 
3974   A convenience routine for this functionality is `MatCreateMPIAIJWithArrays()`.
3975 
3976   You can update the matrix with new numerical values using `MatUpdateMPIAIJWithArrays()` after this call.
3977 
3978   If you will use `MatUpdateMPIAIJWithArrays()`, the column indices in `j` **must** be sorted; if you do **not** use it,
3979   they do not need to be sorted.
3980 
3981   The format used for the sparse matrix input is equivalent to a
3982   row-major ordering, i.e., for the following matrix, the input data expected is
3983   as shown
3984 .vb
3985         1 0 0
3986         2 0 3     P0
3987        -------
3988         4 5 6     P1
3989 
3990      Process0 [P0] rows_owned=[0,1]
3991         i =  {0,1,3}  [size = nrow+1  = 2+1]
3992         j =  {0,0,2}  [size = 3]
3993         v =  {1,2,3}  [size = 3]
3994 
3995      Process1 [P1] rows_owned=[2]
3996         i =  {0,3}    [size = nrow+1  = 1+1]
3997         j =  {0,1,2}  [size = 3]
3998         v =  {4,5,6}  [size = 3]
3999 .ve
4000 
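  For example, a minimal calling sketch for process 0 of the layout above (error checking with `PetscCall()` omitted,
  and `comm` assumed to be the communicator the matrix lives on) is
.vb
     Mat         B;
     PetscInt    i[] = {0, 1, 3}; /* P0 owns rows 0 and 1 */
     PetscInt    j[] = {0, 0, 2}; /* global column indices */
     PetscScalar v[] = {1, 2, 3};

     MatCreate(comm, &B);
     MatSetSizes(B, 2, PETSC_DECIDE, 3, 3);
     MatSetType(B, MATMPIAIJ);
     MatMPIAIJSetPreallocationCSR(B, i, j, v);
.ve
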
4001 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatCreateAIJ()`,
4002           `MatCreateSeqAIJWithArrays()`, `MatCreateMPIAIJWithSplitArrays()`, `MatCreateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4003 @*/
4004 PetscErrorCode MatMPIAIJSetPreallocationCSR(Mat B, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
4005 {
4006   PetscFunctionBegin;
4007   PetscTryMethod(B, "MatMPIAIJSetPreallocationCSR_C", (Mat, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, i, j, v));
4008   PetscFunctionReturn(PETSC_SUCCESS);
4009 }
4010 
4011 /*@
4012   MatMPIAIJSetPreallocation - Preallocates memory for a sparse parallel matrix in `MATMPIAIJ` format
4013   (the default parallel PETSc format).  For good matrix assembly performance
4014   the user should preallocate the matrix storage by setting the parameters
4015   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4016 
4017   Collective
4018 
4019   Input Parameters:
4020 + B     - the matrix
4021 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4022            (same value is used for all local rows)
4023 . d_nnz - array containing the number of nonzeros in the various rows of the
4024            DIAGONAL portion of the local submatrix (possibly different for each row)
4025            or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `d_nz` is used to specify the nonzero structure.
4026            The size of this array is equal to the number of local rows, i.e., 'm'.
4027            For matrices that will be factored, you must leave room for (and set)
4028            the diagonal entry even if it is zero.
4029 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4030            submatrix (same value is used for all local rows).
4031 - o_nnz - array containing the number of nonzeros in the various rows of the
4032            OFF-DIAGONAL portion of the local submatrix (possibly different for
4033            each row) or `NULL` (`PETSC_NULL_INTEGER` in Fortran), if `o_nz` is used to specify the nonzero
4034            structure. The size of this array is equal to the number
4035            of local rows, i.e., 'm'.
4036 
4037   Example Usage:
4038   Consider the following 8x8 matrix with 34 non-zero values that is
4039   assembled across 3 processors. Let's assume that proc0 owns 3 rows,
4040   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4041   as follows
4042 
4043 .vb
4044             1  2  0  |  0  3  0  |  0  4
4045     Proc0   0  5  6  |  7  0  0  |  8  0
4046             9  0 10  | 11  0  0  | 12  0
4047     -------------------------------------
4048            13  0 14  | 15 16 17  |  0  0
4049     Proc1   0 18  0  | 19 20 21  |  0  0
4050             0  0  0  | 22 23  0  | 24  0
4051     -------------------------------------
4052     Proc2  25 26 27  |  0  0 28  | 29  0
4053            30  0  0  | 31 32 33  |  0 34
4054 .ve
4055 
4056   This can be represented as a collection of submatrices as
4057 .vb
4058       A B C
4059       D E F
4060       G H I
4061 .ve
4062 
4063   Where the submatrices A,B,C are owned by proc0, D,E,F are
4064   owned by proc1, G,H,I are owned by proc2.
4065 
4066   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4067   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4068   The 'M','N' parameters are 8,8, and have the same values on all procs.
4069 
4070   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4071   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4072   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4073   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4074   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4075   matrix, and [DF] as another `MATSEQAIJ` matrix.
4076 
4077   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4078   allocated for every row of the local DIAGONAL submatrix, and `o_nz`
4079   storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
4080   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
4081   the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4082   In this case, the values of `d_nz`, `o_nz` are
4083 .vb
4084      proc0  d_nz = 2, o_nz = 2
4085      proc1  d_nz = 3, o_nz = 2
4086      proc2  d_nz = 1, o_nz = 4
4087 .ve
4088   We are allocating `m`*(`d_nz`+`o_nz`) storage locations for every proc. This
4089   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
4090   for proc2, i.e., we are using 12+15+10=37 storage locations to store
4091   34 values.
4092 
4093   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4094   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4095   In the above case the values for `d_nnz`, `o_nnz` are
4096 .vb
4097      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4098      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4099      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4100 .ve
4101   Here the space allocated is the sum of all the above values, i.e., 34, and
4102   hence pre-allocation is perfect.
4103 
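  With those arrays, the preallocation call on proc1 is simply the following sketch (`B` is the already created and
  sized matrix; error checking omitted)
.vb
     PetscInt d_nnz[] = {3, 3, 2};
     PetscInt o_nnz[] = {2, 1, 1};

     MatMPIAIJSetPreallocation(B, 0, d_nnz, 0, o_nnz);
.ve
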
4104   Level: intermediate
4105 
4106   Notes:
4107   If the *_nnz parameter is given then the *_nz parameter is ignored
4108 
4109   The `MATAIJ` format, also called compressed row storage (CSR), is compatible with standard Fortran
4110   storage.  The stored row and column indices begin with zero.
4111   See [Sparse Matrices](sec_matsparse) for details.
4112 
4113   The parallel matrix is partitioned such that the first m0 rows belong to
4114   process 0, the next m1 rows belong to process 1, the next m2 rows belong
4115   to process 2, etc., where m0,m1,m2... are the input parameter 'm'.
4116 
4117   The DIAGONAL portion of the local submatrix of a processor can be defined
4118   as the submatrix which is obtained by extracting the part corresponding to
4119   the rows r1-r2 and columns c1-c2 of the global matrix, where r1 is the
4120   first row that belongs to the processor, r2 is the last row belonging to
4121   this processor, and c1-c2 is the range of indices of the local part of a
4122   vector suitable for applying the matrix to.  This is an mxn matrix.  In the
4123   common case of a square matrix, the row and column ranges are the same and
4124   the DIAGONAL part is also square. The remaining portion of the local
4125   submatrix (m x (N-n)) constitutes the OFF-DIAGONAL portion.
4126 
4127   If `o_nnz` and `d_nnz` are specified, then `o_nz` and `d_nz` are ignored.
4128 
4129   You can call `MatGetInfo()` to get information on how effective the preallocation was;
4130   for example the fields mallocs, nz_allocated, nz_used, and nz_unneeded.
4131   You can also run with the option `-info` and look for messages with the string
4132   malloc in them to see if additional memory allocation was needed.
4133 
4134 .seealso: [](ch_matrices), `Mat`, [Sparse Matrices](sec_matsparse), `MATMPIAIJ`, `MATAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateAIJ()`, `MatMPIAIJSetPreallocationCSR()`,
4135           `MatGetInfo()`, `PetscSplitOwnership()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4136 @*/
4137 PetscErrorCode MatMPIAIJSetPreallocation(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
4138 {
4139   PetscFunctionBegin;
4140   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
4141   PetscValidType(B, 1);
4142   PetscTryMethod(B, "MatMPIAIJSetPreallocation_C", (Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[]), (B, d_nz, d_nnz, o_nz, o_nnz));
4143   PetscFunctionReturn(PETSC_SUCCESS);
4144 }
4145 
4146 /*@
4147   MatCreateMPIAIJWithArrays - creates a `MATMPIAIJ` matrix using arrays that contain the local rows in standard
4148   CSR format.
4149 
4150   Collective
4151 
4152   Input Parameters:
4153 + comm - MPI communicator
4154 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
4155 . n    - This value should be the same as the local size used in creating the
4156          x vector for the matrix-vector product $ y = Ax$. (or `PETSC_DECIDE` to have
4157          calculated if `N` is given) For square matrices n is almost always `m`.
4158 . M    - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
4159 . N    - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
4160 . i    - row indices (of length m+1); that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
4161 . j    - global column indices
4162 - a    - optional matrix values
4163 
4164   Output Parameter:
4165 . mat - the matrix
4166 
4167   Level: intermediate
4168 
4169   Notes:
4170   The `i`, `j`, and `a` arrays ARE copied by this routine into the internal format used by PETSc;
4171   thus you CANNOT change the matrix entries by changing the values of `a[]` after you have
4172   called this routine. Use `MatCreateMPIAIJWithSplitArrays()` to avoid needing to copy the arrays.
4173 
4174   The `i` and `j` indices are 0 based, and `i` indices are indices corresponding to the local `j` array.
4175 
4176   Once you have created the matrix you can update it with new numerical values using `MatUpdateMPIAIJWithArray()`.
4177 
4178   If you will use `MatUpdateMPIAIJWithArray()`, the column indices in `j` **must** be sorted; if you do **not** use it,
4179   they do not need to be sorted.
4180 
4181   The format used for the sparse matrix input is equivalent to a
4182   row-major ordering, i.e., for the following matrix, the input data expected is
4183   as shown
4184 .vb
4185         1 0 0
4186         2 0 3     P0
4187        -------
4188         4 5 6     P1
4189 
4190      Process0 [P0] rows_owned=[0,1]
4191         i =  {0,1,3}  [size = nrow+1  = 2+1]
4192         j =  {0,0,2}  [size = 3]
4193         v =  {1,2,3}  [size = 3]
4194 
4195      Process1 [P1] rows_owned=[2]
4196         i =  {0,3}    [size = nrow+1  = 1+1]
4197         j =  {0,1,2}  [size = 3]
4198         v =  {4,5,6}  [size = 3]
4199 .ve
4200 
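  For example, a sketch of the call that builds the matrix above on process 0 (`comm` assumed; error checking omitted) is
.vb
     Mat         A;
     PetscInt    i[] = {0, 1, 3};
     PetscInt    j[] = {0, 0, 2};
     PetscScalar v[] = {1, 2, 3};

     MatCreateMPIAIJWithArrays(comm, 2, PETSC_DECIDE, PETSC_DETERMINE, 3, i, j, v, &A);
.ve
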
4201 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4202           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4203 @*/
4204 PetscErrorCode MatCreateMPIAIJWithArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt i[], const PetscInt j[], const PetscScalar a[], Mat *mat)
4205 {
4206   PetscFunctionBegin;
4207   PetscCheck(!i || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4208   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4209   PetscCall(MatCreate(comm, mat));
4210   PetscCall(MatSetSizes(*mat, m, n, M, N));
4211   /* PetscCall(MatSetBlockSizes(M,bs,cbs)); */
4212   PetscCall(MatSetType(*mat, MATMPIAIJ));
4213   PetscCall(MatMPIAIJSetPreallocationCSR(*mat, i, j, a));
4214   PetscFunctionReturn(PETSC_SUCCESS);
4215 }
4216 
4217 /*@
4218   MatUpdateMPIAIJWithArrays - updates a `MATMPIAIJ` matrix using arrays that contain the local rows in standard
4219   CSR format. Only the numerical values are updated; the other arrays must be identical to what was passed
4220   to `MatCreateMPIAIJWithArrays()`
4221 
4222   Deprecated: Use `MatUpdateMPIAIJWithArray()`
4223 
4224   Collective
4225 
4226   Input Parameters:
4227 + mat - the matrix
4228 . m   - number of local rows (Cannot be `PETSC_DECIDE`)
4229 . n   - This value should be the same as the local size used in creating the
4230        x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4231        calculated if N is given) For square matrices n is almost always m.
4232 . M   - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4233 . N   - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4234 . Ii  - row indices; that is Ii[0] = 0, Ii[row] = Ii[row-1] + number of elements in that row of the matrix
4235 . J   - column indices
4236 - v   - matrix values
4237 
4238   Level: deprecated
4239 
4240 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4241           `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArray()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4242 @*/
4243 PetscErrorCode MatUpdateMPIAIJWithArrays(Mat mat, PetscInt m, PetscInt n, PetscInt M, PetscInt N, const PetscInt Ii[], const PetscInt J[], const PetscScalar v[])
4244 {
4245   PetscInt        nnz, i;
4246   PetscBool       nooffprocentries;
4247   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4248   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4249   PetscScalar    *ad, *ao;
4250   PetscInt        ldi, Iii, md;
4251   const PetscInt *Adi = Ad->i;
4252   PetscInt       *ld  = Aij->ld;
4253 
4254   PetscFunctionBegin;
4255   PetscCheck(Ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
4256   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE, or negative");
4257   PetscCheck(m == mat->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of rows cannot change from call to MatUpdateMPIAIJWithArrays()");
4258   PetscCheck(n == mat->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Local number of columns cannot change from call to MatUpdateMPIAIJWithArrays()");
4259 
4260   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4261   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4262 
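  /* Each row's values are stored contiguously in v: the ld[i] off-diagonal entries to the left of the
     diagonal block, then the md diagonal-block entries, then the remaining off-diagonal entries */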
4263   for (i = 0; i < m; i++) {
4264     if (PetscDefined(USE_DEBUG)) {
4265       for (PetscInt j = Ii[i] + 1; j < Ii[i + 1]; ++j) {
4266         PetscCheck(J[j] >= J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is not sorted", j - Ii[i], J[j], i);
4267         PetscCheck(J[j] != J[j - 1], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column entry number %" PetscInt_FMT " (actual column %" PetscInt_FMT ") in row %" PetscInt_FMT " is identical to previous entry", j - Ii[i], J[j], i);
4268       }
4269     }
4270     nnz = Ii[i + 1] - Ii[i];
4271     Iii = Ii[i];
4272     ldi = ld[i];
4273     md  = Adi[i + 1] - Adi[i];
4274     PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4275     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4276     PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4277     ad += md;
4278     ao += nnz - md;
4279   }
4280   nooffprocentries      = mat->nooffprocentries;
4281   mat->nooffprocentries = PETSC_TRUE;
4282   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4283   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4284   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4285   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4286   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4287   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4288   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4289   mat->nooffprocentries = nooffprocentries;
4290   PetscFunctionReturn(PETSC_SUCCESS);
4291 }
4292 
4293 /*@
4294   MatUpdateMPIAIJWithArray - updates a `MATMPIAIJ` matrix using an array that contains the nonzero values
4295 
4296   Collective
4297 
4298   Input Parameters:
4299 + mat - the matrix
4300 - v   - matrix values, stored by row
4301 
4302   Level: intermediate
4303 
4304   Notes:
4305   The matrix must have been obtained with `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()`
4306 
4307   The column indices in the call to `MatCreateMPIAIJWithArrays()` or `MatMPIAIJSetPreallocationCSR()` must have been sorted for this call to work correctly
4308 
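  A sketch of the expected usage, assuming `A` was created with `MatCreateMPIAIJWithArrays()` and `vnew` holds one
  scalar per stored nonzero in the same row-by-row order as the original CSR values array, is
.vb
     MatUpdateMPIAIJWithArray(A, vnew);
.ve
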
4309 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4310           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithSplitArrays()`, `MatUpdateMPIAIJWithArrays()`, `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
4311 @*/
4312 PetscErrorCode MatUpdateMPIAIJWithArray(Mat mat, const PetscScalar v[])
4313 {
4314   PetscInt        nnz, i, m;
4315   PetscBool       nooffprocentries;
4316   Mat_MPIAIJ     *Aij = (Mat_MPIAIJ *)mat->data;
4317   Mat_SeqAIJ     *Ad  = (Mat_SeqAIJ *)Aij->A->data;
4318   Mat_SeqAIJ     *Ao  = (Mat_SeqAIJ *)Aij->B->data;
4319   PetscScalar    *ad, *ao;
4320   const PetscInt *Adi = Ad->i, *Adj = Ao->i;
4321   PetscInt        ldi, Iii, md;
4322   PetscInt       *ld = Aij->ld;
4323 
4324   PetscFunctionBegin;
4325   m = mat->rmap->n;
4326 
4327   PetscCall(MatSeqAIJGetArrayWrite(Aij->A, &ad));
4328   PetscCall(MatSeqAIJGetArrayWrite(Aij->B, &ao));
4329   Iii = 0;
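  /* as in MatUpdateMPIAIJWithArrays(): row i of v holds ld[i] off-diagonal entries, then the diagonal-block
     entries, then the remaining off-diagonal entries */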
4330   for (i = 0; i < m; i++) {
4331     nnz = Adi[i + 1] - Adi[i] + Adj[i + 1] - Adj[i];
4332     ldi = ld[i];
4333     md  = Adi[i + 1] - Adi[i];
4334     PetscCall(PetscArraycpy(ad, v + Iii + ldi, md));
4335     ad += md;
4336     if (ao) {
4337       PetscCall(PetscArraycpy(ao, v + Iii, ldi));
4338       PetscCall(PetscArraycpy(ao + ldi, v + Iii + ldi + md, nnz - ldi - md));
4339       ao += nnz - md;
4340     }
4341     Iii += nnz;
4342   }
4343   nooffprocentries      = mat->nooffprocentries;
4344   mat->nooffprocentries = PETSC_TRUE;
4345   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->A, &ad));
4346   PetscCall(MatSeqAIJRestoreArrayWrite(Aij->B, &ao));
4347   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->A));
4348   PetscCall(PetscObjectStateIncrease((PetscObject)Aij->B));
4349   PetscCall(PetscObjectStateIncrease((PetscObject)mat));
4350   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
4351   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
4352   mat->nooffprocentries = nooffprocentries;
4353   PetscFunctionReturn(PETSC_SUCCESS);
4354 }
4355 
4356 /*@
4357   MatCreateAIJ - Creates a sparse parallel matrix in `MATAIJ` format
4358   (the default parallel PETSc format).  For good matrix assembly performance
4359   the user should preallocate the matrix storage by setting the parameters
4360   `d_nz` (or `d_nnz`) and `o_nz` (or `o_nnz`).
4361 
4362   Collective
4363 
4364   Input Parameters:
4365 + comm  - MPI communicator
4366 . m     - number of local rows (or `PETSC_DECIDE` to have calculated if M is given)
4367           This value should be the same as the local size used in creating the
4368           y vector for the matrix-vector product y = Ax.
4369 . n     - This value should be the same as the local size used in creating the
4370           x vector for the matrix-vector product y = Ax. (or `PETSC_DECIDE` to have
4371           calculated if N is given) For square matrices n is almost always m.
4372 . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if m is given)
4373 . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if n is given)
4374 . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
4375           (same value is used for all local rows)
4376 . d_nnz - array containing the number of nonzeros in the various rows of the
4377           DIAGONAL portion of the local submatrix (possibly different for each row)
4378           or `NULL`, if `d_nz` is used to specify the nonzero structure.
4379           The size of this array is equal to the number of local rows, i.e., 'm'.
4380 . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
4381           submatrix (same value is used for all local rows).
4382 - o_nnz - array containing the number of nonzeros in the various rows of the
4383           OFF-DIAGONAL portion of the local submatrix (possibly different for
4384           each row) or `NULL`, if `o_nz` is used to specify the nonzero
4385           structure. The size of this array is equal to the number
4386           of local rows, i.e., 'm'.
4387 
4388   Output Parameter:
4389 . A - the matrix
4390 
4391   Options Database Keys:
4392 + -mat_no_inode                     - Do not use inodes
4393 . -mat_inode_limit <limit>          - Sets inode limit (max limit=5)
4394 - -matmult_vecscatter_view <viewer> - View the vecscatter (i.e., communication pattern) used in `MatMult()` of sparse parallel matrices.
4395                                       See viewer types in manual of `MatView()`. Of them, ascii_matlab, draw or binary cause the `VecScatter`
4396                                       to be viewed as a matrix. Entry (i,j) is the size of message (in bytes) rank i sends to rank j in one `MatMult()` call.
4397 
4398   Level: intermediate
4399 
4400   Notes:
4401   It is recommended that one use `MatCreateFromOptions()` or the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
4402   MatXXXXSetPreallocation() paradigm instead of this routine directly.
4403   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
4404 
4405   If the *_nnz parameter is given then the *_nz parameter is ignored
4406 
4407   The `m`,`n`,`M`,`N` parameters specify the size of the matrix, and its partitioning across
4408   processors, while `d_nz`,`d_nnz`,`o_nz`,`o_nnz` parameters specify the approximate
4409   storage requirements for this matrix.
4410 
4411   If `PETSC_DECIDE` or `PETSC_DETERMINE` is used for a particular argument on one
4412   processor then it must be used on all processors that share the object for
4413   that argument.
4414 
4415   If `m` and `n` are not `PETSC_DECIDE`, then the values determine the `PetscLayout` of the matrix and the ranges returned by
4416   `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`, and `MatGetOwnershipRangesColumn()`.
4417 
4418   The user MUST specify either the local or global matrix dimensions
4419   (possibly both).
4420 
4421   The parallel matrix is partitioned across processors such that the
4422   first `m0` rows belong to process 0, the next `m1` rows belong to
4423   process 1, the next `m2` rows belong to process 2, etc., where
4424   `m0`, `m1`, `m2`... are the input parameter `m` on each MPI process. I.e., each MPI process stores
4425   values corresponding to an [m x N] submatrix.
4426 
4427   The columns are logically partitioned with the n0 columns belonging
4428   to the 0th partition, the next n1 columns belonging to the next
4429   partition, etc., where n0,n1,n2... are the input parameter 'n'.
4430 
4431   The DIAGONAL portion of the local submatrix on any given processor
4432   is the submatrix corresponding to the rows and columns m,n
4433   corresponding to the given processor, i.e., the diagonal matrix on
4434   process 0 is [m0 x n0], diagonal matrix on process 1 is [m1 x n1]
4435   etc. The remaining portion of the local submatrix [m x (N-n)]
4436   constitute the OFF-DIAGONAL portion. The example below better
4437   illustrates this concept. The two matrices, the DIAGONAL portion and
4438   the OFF-DIAGONAL portion are each stored as `MATSEQAIJ` matrices.
4439 
4440   For a square global matrix we define each processor's diagonal portion
4441   to be its local rows and the corresponding columns (a square submatrix);
4442   each processor's off-diagonal portion encompasses the remainder of the
4443   local matrix (a rectangular submatrix).
4444 
4445   If `o_nnz`, `d_nnz` are specified, then `o_nz`, and `d_nz` are ignored.
4446 
4447   When calling this routine with a single process communicator, a matrix of
4448   type `MATSEQAIJ` is returned.  If a matrix of type `MATMPIAIJ` is desired for this
4449   type of communicator, use the construction mechanism
4450 .vb
4451   MatCreate(..., &A);
4452   MatSetType(A, MATMPIAIJ);
4453   MatSetSizes(A, m, n, M, N);
4454   MatMPIAIJSetPreallocation(A, ...);
4455 .ve
4456 
4457   By default, this format uses inodes (identical nodes) when possible.
4458   We search for consecutive rows with the same nonzero structure, thereby
4459   reusing matrix information to achieve increased efficiency.
4460 
4461   Example Usage:
4462   Consider the following 8x8 matrix with 34 non-zero values that is
4463   assembled across 3 processors. Let's assume that proc0 owns 3 rows,
4464   proc1 owns 3 rows, proc2 owns 2 rows. This division can be shown
4465   as follows
4466 
4467 .vb
4468             1  2  0  |  0  3  0  |  0  4
4469     Proc0   0  5  6  |  7  0  0  |  8  0
4470             9  0 10  | 11  0  0  | 12  0
4471     -------------------------------------
4472            13  0 14  | 15 16 17  |  0  0
4473     Proc1   0 18  0  | 19 20 21  |  0  0
4474             0  0  0  | 22 23  0  | 24  0
4475     -------------------------------------
4476     Proc2  25 26 27  |  0  0 28  | 29  0
4477            30  0  0  | 31 32 33  |  0 34
4478 .ve
4479 
4480   This can be represented as a collection of submatrices as
4481 
4482 .vb
4483       A B C
4484       D E F
4485       G H I
4486 .ve
4487 
4488   Where the submatrices A,B,C are owned by proc0, D,E,F are
4489   owned by proc1, G,H,I are owned by proc2.
4490 
4491   The 'm' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4492   The 'n' parameters for proc0,proc1,proc2 are 3,3,2 respectively.
4493   The 'M','N' parameters are 8,8, and have the same values on all procs.
4494 
4495   The DIAGONAL submatrices corresponding to proc0,proc1,proc2 are
4496   submatrices [A], [E], [I] respectively. The OFF-DIAGONAL submatrices
4497   corresponding to proc0,proc1,proc2 are [BC], [DF], [GH] respectively.
4498   Internally, each processor stores the DIAGONAL part, and the OFF-DIAGONAL
4499   part as `MATSEQAIJ` matrices. For example, proc1 will store [E] as a `MATSEQAIJ`
4500   matrix, and [DF] as another `MATSEQAIJ` matrix.
4501 
4502   When `d_nz`, `o_nz` parameters are specified, `d_nz` storage elements are
4503   allocated for every row of the local DIAGONAL submatrix, and `o_nz`
4504   storage locations are allocated for every row of the OFF-DIAGONAL submatrix.
4505   One way to choose `d_nz` and `o_nz` is to use the maximum number of nonzeros over
4506   the local rows for each of the local DIAGONAL, and the OFF-DIAGONAL submatrices.
4507   In this case, the values of `d_nz`,`o_nz` are
4508 .vb
4509      proc0  d_nz = 2, o_nz = 2
4510      proc1  d_nz = 3, o_nz = 2
4511      proc2  d_nz = 1, o_nz = 4
4512 .ve
4513   We are allocating m*(`d_nz`+`o_nz`) storage locations for every proc. This
4514   translates to 3*(2+2)=12 for proc0, 3*(3+2)=15 for proc1, 2*(1+4)=10
4515   for proc2, i.e., we are using 12+15+10=37 storage locations to store
4516   34 values.
4517 
4518   When `d_nnz`, `o_nnz` parameters are specified, the storage is specified
4519   for every row, corresponding to both DIAGONAL and OFF-DIAGONAL submatrices.
4520   In the above case the values for `d_nnz`, `o_nnz` are
4521 .vb
4522      proc0 d_nnz = [2,2,2] and o_nnz = [2,2,2]
4523      proc1 d_nnz = [3,3,2] and o_nnz = [2,1,1]
4524      proc2 d_nnz = [1,1]   and o_nnz = [4,4]
4525 .ve
4526   Here the space allocated is the sum of all the above values, i.e., 34, and
4527   hence pre-allocation is perfect.
4528 
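  With those arrays, the creation call on proc1 is the following sketch (`comm` assumed; error checking omitted)
.vb
     Mat      A;
     PetscInt d_nnz[] = {3, 3, 2};
     PetscInt o_nnz[] = {2, 1, 1};

     MatCreateAIJ(comm, 3, 3, 8, 8, 0, d_nnz, 0, o_nnz, &A);
.ve
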
4529 .seealso: [](ch_matrices), `Mat`, [Sparse Matrix Creation](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
4530           `MATMPIAIJ`, `MatCreateMPIAIJWithArrays()`, `MatGetOwnershipRange()`, `MatGetOwnershipRanges()`, `MatGetOwnershipRangeColumn()`,
4531           `MatGetOwnershipRangesColumn()`, `PetscLayout`
4532 @*/
4533 PetscErrorCode MatCreateAIJ(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
4534 {
4535   PetscMPIInt size;
4536 
4537   PetscFunctionBegin;
4538   PetscCall(MatCreate(comm, A));
4539   PetscCall(MatSetSizes(*A, m, n, M, N));
4540   PetscCallMPI(MPI_Comm_size(comm, &size));
4541   if (size > 1) {
4542     PetscCall(MatSetType(*A, MATMPIAIJ));
4543     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
4544   } else {
4545     PetscCall(MatSetType(*A, MATSEQAIJ));
4546     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
4547   }
4548   PetscFunctionReturn(PETSC_SUCCESS);
4549 }
4550 
4551 /*@C
4552   MatMPIAIJGetSeqAIJ - Returns the local pieces of this distributed matrix
4553 
4554   Not Collective
4555 
4556   Input Parameter:
4557 . A - The `MATMPIAIJ` matrix
4558 
4559   Output Parameters:
4560 + Ad     - The local diagonal block as a `MATSEQAIJ` matrix
4561 . Ao     - The local off-diagonal block as a `MATSEQAIJ` matrix
4562 - colmap - An array mapping local column numbers of `Ao` to global column numbers of the parallel matrix
4563 
4564   Level: intermediate
4565 
4566   Note:
4567   The rows in `Ad` and `Ao` are in [0, Nr), where Nr is the number of local rows on this process. The columns
4568   in `Ad` are in [0, Nc), where Nc is the number of local columns. The columns in `Ao` are in [0, Nco), where Nco is
4569   the number of nonzero columns in the local off-diagonal piece of the matrix `A`. The array colmap maps these
4570   local column numbers to global column numbers in the original matrix.
4571 
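  A minimal usage sketch (the returned objects are the matrix's internal pieces, so the caller must not destroy them) is
.vb
     Mat             Ad, Ao;
     const PetscInt *colmap;

     MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &colmap);
     /* colmap[c] is the global column number of local column c of Ao */
.ve
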
4572 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`, `MatCreateAIJ()`, `MATSEQAIJ`
4573 @*/
4574 PetscErrorCode MatMPIAIJGetSeqAIJ(Mat A, Mat *Ad, Mat *Ao, const PetscInt *colmap[])
4575 {
4576   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
4577   PetscBool   flg;
4578 
4579   PetscFunctionBegin;
4580   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &flg));
4581   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "This function requires a MATMPIAIJ matrix as input");
4582   if (Ad) *Ad = a->A;
4583   if (Ao) *Ao = a->B;
4584   if (colmap) *colmap = a->garray;
4585   PetscFunctionReturn(PETSC_SUCCESS);
4586 }
4587 
4588 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_MPIAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
4589 {
4590   PetscInt     m, N, i, rstart, nnz, Ii;
4591   PetscInt    *indx;
4592   PetscScalar *values;
4593   MatType      rootType;
4594 
4595   PetscFunctionBegin;
4596   PetscCall(MatGetSize(inmat, &m, &N));
4597   if (scall == MAT_INITIAL_MATRIX) { /* symbolic phase */
4598     PetscInt *dnz, *onz, sum, bs, cbs;
4599 
4600     if (n == PETSC_DECIDE) PetscCall(PetscSplitOwnership(comm, &n, &N));
4601     /* Check sum(n) = N */
4602     PetscCallMPI(MPIU_Allreduce(&n, &sum, 1, MPIU_INT, MPI_SUM, comm));
4603     PetscCheck(sum == N, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Sum of local columns %" PetscInt_FMT " != global columns %" PetscInt_FMT, sum, N);
4604 
4605     PetscCallMPI(MPI_Scan(&m, &rstart, 1, MPIU_INT, MPI_SUM, comm));
4606     rstart -= m;
4607 
4608     MatPreallocateBegin(comm, m, n, dnz, onz);
4609     for (i = 0; i < m; i++) {
4610       PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4611       PetscCall(MatPreallocateSet(i + rstart, nnz, indx, dnz, onz));
4612       PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, NULL));
4613     }
4614 
4615     PetscCall(MatCreate(comm, outmat));
4616     PetscCall(MatSetSizes(*outmat, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
4617     PetscCall(MatGetBlockSizes(inmat, &bs, &cbs));
4618     PetscCall(MatSetBlockSizes(*outmat, bs, cbs));
4619     PetscCall(MatGetRootType_Private(inmat, &rootType));
4620     PetscCall(MatSetType(*outmat, rootType));
4621     PetscCall(MatSeqAIJSetPreallocation(*outmat, 0, dnz));
4622     PetscCall(MatMPIAIJSetPreallocation(*outmat, 0, dnz, 0, onz));
4623     MatPreallocateEnd(dnz, onz);
4624     PetscCall(MatSetOption(*outmat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
4625   }
4626 
4627   /* numeric phase */
4628   PetscCall(MatGetOwnershipRange(*outmat, &rstart, NULL));
4629   for (i = 0; i < m; i++) {
4630     PetscCall(MatGetRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4631     Ii = i + rstart;
4632     PetscCall(MatSetValues(*outmat, 1, &Ii, nnz, indx, values, INSERT_VALUES));
4633     PetscCall(MatRestoreRow_SeqAIJ(inmat, i, &nnz, &indx, &values));
4634   }
4635   PetscCall(MatAssemblyBegin(*outmat, MAT_FINAL_ASSEMBLY));
4636   PetscCall(MatAssemblyEnd(*outmat, MAT_FINAL_ASSEMBLY));
4637   PetscFunctionReturn(PETSC_SUCCESS);
4638 }
4639 
4640 static PetscErrorCode MatDestroy_MPIAIJ_SeqsToMPI(void **data)
4641 {
4642   Mat_Merge_SeqsToMPI *merge = (Mat_Merge_SeqsToMPI *)*data;
4643 
4644   PetscFunctionBegin;
4645   if (!merge) PetscFunctionReturn(PETSC_SUCCESS);
4646   PetscCall(PetscFree(merge->id_r));
4647   PetscCall(PetscFree(merge->len_s));
4648   PetscCall(PetscFree(merge->len_r));
4649   PetscCall(PetscFree(merge->bi));
4650   PetscCall(PetscFree(merge->bj));
4651   PetscCall(PetscFree(merge->buf_ri[0]));
4652   PetscCall(PetscFree(merge->buf_ri));
4653   PetscCall(PetscFree(merge->buf_rj[0]));
4654   PetscCall(PetscFree(merge->buf_rj));
4655   PetscCall(PetscFree(merge->coi));
4656   PetscCall(PetscFree(merge->coj));
4657   PetscCall(PetscFree(merge->owners_co));
4658   PetscCall(PetscLayoutDestroy(&merge->rowmap));
4659   PetscCall(PetscFree(merge));
4660   PetscFunctionReturn(PETSC_SUCCESS);
4661 }
4662 
4663 #include <../src/mat/utils/freespace.h>
4664 #include <petscbt.h>
4665 
4666 PetscErrorCode MatCreateMPIAIJSumSeqAIJNumeric(Mat seqmat, Mat mpimat)
4667 {
4668   MPI_Comm             comm;
4669   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
4670   PetscMPIInt          size, rank, taga, *len_s;
4671   PetscInt             N = mpimat->cmap->N, i, j, *owners, *ai = a->i, *aj, m;
4672   PetscMPIInt          proc, k;
4673   PetscInt           **buf_ri, **buf_rj;
4674   PetscInt             anzi, *bj_i, *bi, *bj, arow, bnzi, nextaj;
4675   PetscInt             nrows, **buf_ri_k, **nextrow, **nextai;
4676   MPI_Request         *s_waits, *r_waits;
4677   MPI_Status          *status;
4678   const MatScalar     *aa, *a_a;
4679   MatScalar          **abuf_r, *ba_i;
4680   Mat_Merge_SeqsToMPI *merge;
4681   PetscContainer       container;
4682 
4683   PetscFunctionBegin;
4684   PetscCall(PetscObjectGetComm((PetscObject)mpimat, &comm));
4685   PetscCall(PetscLogEventBegin(MAT_Seqstompinum, seqmat, 0, 0, 0));
4686 
4687   PetscCallMPI(MPI_Comm_size(comm, &size));
4688   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4689 
4690   PetscCall(PetscObjectQuery((PetscObject)mpimat, "MatMergeSeqsToMPI", (PetscObject *)&container));
4691   PetscCheck(container, PetscObjectComm((PetscObject)mpimat), PETSC_ERR_PLIB, "Mat not created from MatCreateMPIAIJSumSeqAIJSymbolic");
4692   PetscCall(PetscContainerGetPointer(container, (void **)&merge));
4693   PetscCall(MatSeqAIJGetArrayRead(seqmat, &a_a));
4694   aa = a_a;
4695 
4696   bi     = merge->bi;
4697   bj     = merge->bj;
4698   buf_ri = merge->buf_ri;
4699   buf_rj = merge->buf_rj;
4700 
4701   PetscCall(PetscMalloc1(size, &status));
4702   owners = merge->rowmap->range;
4703   len_s  = merge->len_s;
4704 
4705   /* send and recv matrix values */
4706   PetscCall(PetscObjectGetNewTag((PetscObject)mpimat, &taga));
4707   PetscCall(PetscPostIrecvScalar(comm, taga, merge->nrecv, merge->id_r, merge->len_r, &abuf_r, &r_waits));
4708 
4709   PetscCall(PetscMalloc1(merge->nsend + 1, &s_waits));
4710   for (proc = 0, k = 0; proc < size; proc++) {
4711     if (!len_s[proc]) continue;
4712     i = owners[proc];
4713     PetscCallMPI(MPIU_Isend(aa + ai[i], len_s[proc], MPIU_MATSCALAR, proc, taga, comm, s_waits + k));
4714     k++;
4715   }
4716 
4717   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, r_waits, status));
4718   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, s_waits, status));
4719   PetscCall(PetscFree(status));
4720 
4721   PetscCall(PetscFree(s_waits));
4722   PetscCall(PetscFree(r_waits));
4723 
4724   /* insert mat values of mpimat */
4725   PetscCall(PetscMalloc1(N, &ba_i));
4726   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4727 
4728   for (k = 0; k < merge->nrecv; k++) {
4729     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4730     nrows       = *buf_ri_k[k];
4731     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4732     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4733   }
4734 
4735   /* set values of ba */
4736   m = merge->rowmap->n;
4737   for (i = 0; i < m; i++) {
4738     arow = owners[rank] + i;
4739     bj_i = bj + bi[i]; /* col indices of the i-th row of mpimat */
4740     bnzi = bi[i + 1] - bi[i];
4741     PetscCall(PetscArrayzero(ba_i, bnzi));
4742 
4743     /* add local non-zero vals of this proc's seqmat into ba */
4744     anzi   = ai[arow + 1] - ai[arow];
4745     aj     = a->j + ai[arow];
4746     aa     = a_a + ai[arow];
4747     nextaj = 0;
4748     for (j = 0; nextaj < anzi; j++) {
4749       if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4750         ba_i[j] += aa[nextaj++];
4751       }
4752     }
4753 
4754     /* add received vals into ba */
4755     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
4756       /* i-th row */
4757       if (i == *nextrow[k]) {
4758         anzi   = *(nextai[k] + 1) - *nextai[k];
4759         aj     = buf_rj[k] + *nextai[k];
4760         aa     = abuf_r[k] + *nextai[k];
4761         nextaj = 0;
4762         for (j = 0; nextaj < anzi; j++) {
4763           if (*(bj_i + j) == aj[nextaj]) { /* bcol == acol */
4764             ba_i[j] += aa[nextaj++];
4765           }
4766         }
4767         nextrow[k]++;
4768         nextai[k]++;
4769       }
4770     }
4771     PetscCall(MatSetValues(mpimat, 1, &arow, bnzi, bj_i, ba_i, INSERT_VALUES));
4772   }
4773   PetscCall(MatSeqAIJRestoreArrayRead(seqmat, &a_a));
4774   PetscCall(MatAssemblyBegin(mpimat, MAT_FINAL_ASSEMBLY));
4775   PetscCall(MatAssemblyEnd(mpimat, MAT_FINAL_ASSEMBLY));
4776 
4777   PetscCall(PetscFree(abuf_r[0]));
4778   PetscCall(PetscFree(abuf_r));
4779   PetscCall(PetscFree(ba_i));
4780   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
4781   PetscCall(PetscLogEventEnd(MAT_Seqstompinum, seqmat, 0, 0, 0));
4782   PetscFunctionReturn(PETSC_SUCCESS);
4783 }
4784 
4785 PetscErrorCode MatCreateMPIAIJSumSeqAIJSymbolic(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, Mat *mpimat)
4786 {
4787   Mat                  B_mpi;
4788   Mat_SeqAIJ          *a = (Mat_SeqAIJ *)seqmat->data;
4789   PetscMPIInt          size, rank, tagi, tagj, *len_s, *len_si, *len_ri;
4790   PetscInt           **buf_rj, **buf_ri, **buf_ri_k;
4791   PetscInt             M = seqmat->rmap->n, N = seqmat->cmap->n, i, *owners, *ai = a->i, *aj = a->j;
4792   PetscInt             len, *dnz, *onz, bs, cbs;
4793   PetscInt             k, anzi, *bi, *bj, *lnk, nlnk, arow, bnzi;
4794   PetscInt             nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextai;
4795   MPI_Request         *si_waits, *sj_waits, *ri_waits, *rj_waits;
4796   MPI_Status          *status;
4797   PetscFreeSpaceList   free_space = NULL, current_space = NULL;
4798   PetscBT              lnkbt;
4799   Mat_Merge_SeqsToMPI *merge;
4800   PetscContainer       container;
4801 
4802   PetscFunctionBegin;
4803   PetscCall(PetscLogEventBegin(MAT_Seqstompisym, seqmat, 0, 0, 0));
4804 
4805   /* make sure it is a PETSc comm */
4806   PetscCall(PetscCommDuplicate(comm, &comm, NULL));
4807   PetscCallMPI(MPI_Comm_size(comm, &size));
4808   PetscCallMPI(MPI_Comm_rank(comm, &rank));
4809 
4810   PetscCall(PetscNew(&merge));
4811   PetscCall(PetscMalloc1(size, &status));
4812 
4813   /* determine row ownership */
4814   PetscCall(PetscLayoutCreate(comm, &merge->rowmap));
4815   PetscCall(PetscLayoutSetLocalSize(merge->rowmap, m));
4816   PetscCall(PetscLayoutSetSize(merge->rowmap, M));
4817   PetscCall(PetscLayoutSetBlockSize(merge->rowmap, 1));
4818   PetscCall(PetscLayoutSetUp(merge->rowmap));
4819   PetscCall(PetscMalloc1(size, &len_si));
4820   PetscCall(PetscMalloc1(size, &merge->len_s));
4821 
4822   m      = merge->rowmap->n;
4823   owners = merge->rowmap->range;
4824 
4825   /* determine the number of messages to send, their lengths */
4826   len_s = merge->len_s;
4827 
4828   len          = 0; /* length of buf_si[] */
4829   merge->nsend = 0;
4830   for (PetscMPIInt proc = 0; proc < size; proc++) {
4831     len_si[proc] = 0;
4832     if (proc == rank) {
4833       len_s[proc] = 0;
4834     } else {
4835       PetscCall(PetscMPIIntCast(owners[proc + 1] - owners[proc] + 1, &len_si[proc]));
4836       PetscCall(PetscMPIIntCast(ai[owners[proc + 1]] - ai[owners[proc]], &len_s[proc])); /* num of rows to be sent to [proc] */
4837     }
4838     if (len_s[proc]) {
4839       merge->nsend++;
4840       nrows = 0;
4841       for (i = owners[proc]; i < owners[proc + 1]; i++) {
4842         if (ai[i + 1] > ai[i]) nrows++;
4843       }
4844       PetscCall(PetscMPIIntCast(2 * (nrows + 1), &len_si[proc]));
4845       len += len_si[proc];
4846     }
4847   }
4848 
4849   /* determine the number and length of messages to receive for ij-structure */
4850   PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &merge->nrecv));
4851   PetscCall(PetscGatherMessageLengths2(comm, merge->nsend, merge->nrecv, len_s, len_si, &merge->id_r, &merge->len_r, &len_ri));
4852 
4853   /* post the Irecv of j-structure */
4854   PetscCall(PetscCommGetNewTag(comm, &tagj));
4855   PetscCall(PetscPostIrecvInt(comm, tagj, merge->nrecv, merge->id_r, merge->len_r, &buf_rj, &rj_waits));
4856 
4857   /* post the Isend of j-structure */
4858   PetscCall(PetscMalloc2(merge->nsend, &si_waits, merge->nsend, &sj_waits));
4859 
4860   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4861     if (!len_s[proc]) continue;
4862     i = owners[proc];
4863     PetscCallMPI(MPIU_Isend(aj + ai[i], len_s[proc], MPIU_INT, proc, tagj, comm, sj_waits + k));
4864     k++;
4865   }
4866 
4867   /* receives and sends of j-structure are complete */
4868   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, rj_waits, status));
4869   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, sj_waits, status));
4870 
4871   /* send and recv i-structure */
4872   PetscCall(PetscCommGetNewTag(comm, &tagi));
4873   PetscCall(PetscPostIrecvInt(comm, tagi, merge->nrecv, merge->id_r, len_ri, &buf_ri, &ri_waits));
4874 
4875   PetscCall(PetscMalloc1(len + 1, &buf_s));
4876   buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
4877   for (PetscMPIInt proc = 0, k = 0; proc < size; proc++) {
4878     if (!len_s[proc]) continue;
4879     /* form outgoing message for i-structure:
4880          buf_si[0]:                 nrows to be sent
4881                [1:nrows]:           row index (global)
4882                [nrows+1:2*nrows+1]: i-structure index
4883     */
4884     nrows       = len_si[proc] / 2 - 1;
4885     buf_si_i    = buf_si + nrows + 1;
4886     buf_si[0]   = nrows;
4887     buf_si_i[0] = 0;
4888     nrows       = 0;
4889     for (i = owners[proc]; i < owners[proc + 1]; i++) {
4890       anzi = ai[i + 1] - ai[i];
4891       if (anzi) {
4892         buf_si_i[nrows + 1] = buf_si_i[nrows] + anzi; /* i-structure */
4893         buf_si[nrows + 1]   = i - owners[proc];       /* local row index */
4894         nrows++;
4895       }
4896     }
4897     PetscCallMPI(MPIU_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, si_waits + k));
4898     k++;
4899     buf_si += len_si[proc];
4900   }
4901 
4902   if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, ri_waits, status));
4903   if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, si_waits, status));
4904 
4905   PetscCall(PetscInfo(seqmat, "nsend: %d, nrecv: %d\n", merge->nsend, merge->nrecv));
4906   for (i = 0; i < merge->nrecv; i++) PetscCall(PetscInfo(seqmat, "recv len_ri=%d, len_rj=%d from [%d]\n", len_ri[i], merge->len_r[i], merge->id_r[i]));
4907 
4908   PetscCall(PetscFree(len_si));
4909   PetscCall(PetscFree(len_ri));
4910   PetscCall(PetscFree(rj_waits));
4911   PetscCall(PetscFree2(si_waits, sj_waits));
4912   PetscCall(PetscFree(ri_waits));
4913   PetscCall(PetscFree(buf_s));
4914   PetscCall(PetscFree(status));
4915 
4916   /* compute a local seq matrix in each processor */
4917   /* allocate bi array and free space for accumulating nonzero column info */
4918   PetscCall(PetscMalloc1(m + 1, &bi));
4919   bi[0] = 0;
4920 
4921   /* create and initialize a linked list */
4922   nlnk = N + 1;
4923   PetscCall(PetscLLCreate(N, N, nlnk, lnk, lnkbt));
4924 
4925   /* initial FreeSpace size is 2*(num of local nnz(seqmat)) */
4926   len = ai[owners[rank + 1]] - ai[owners[rank]];
4927   PetscCall(PetscFreeSpaceGet(PetscIntMultTruncate(2, len) + 1, &free_space));
4928 
4929   current_space = free_space;
4930 
4931   /* determine symbolic info for each local row */
4932   PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextai));
4933 
4934   for (k = 0; k < merge->nrecv; k++) {
4935     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
4936     nrows       = *buf_ri_k[k];
4937     nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
4938     nextai[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
4939   }
4940 
4941   MatPreallocateBegin(comm, m, n, dnz, onz);
4942   len = 0;
4943   for (i = 0; i < m; i++) {
4944     bnzi = 0;
4945     /* add local non-zero cols of this proc's seqmat into lnk */
4946     arow = owners[rank] + i;
4947     anzi = ai[arow + 1] - ai[arow];
4948     aj   = a->j + ai[arow];
4949     PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
4950     bnzi += nlnk;
4951     /* add received col data into lnk */
4952     for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
4953       if (i == *nextrow[k]) {            /* i-th row */
4954         anzi = *(nextai[k] + 1) - *nextai[k];
4955         aj   = buf_rj[k] + *nextai[k];
4956         PetscCall(PetscLLAddSorted(anzi, aj, N, &nlnk, lnk, lnkbt));
4957         bnzi += nlnk;
4958         nextrow[k]++;
4959         nextai[k]++;
4960       }
4961     }
4962     if (len < bnzi) len = bnzi; /* =max(bnzi) */
4963 
4964     /* if free space is not available, make more free space */
4965     if (current_space->local_remaining < bnzi) PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(bnzi, current_space->total_array_size), &current_space));
4966     /* copy data into free space, then initialize lnk */
4967     PetscCall(PetscLLClean(N, N, bnzi, lnk, current_space->array, lnkbt));
4968     PetscCall(MatPreallocateSet(i + owners[rank], bnzi, current_space->array, dnz, onz));
4969 
4970     current_space->array += bnzi;
4971     current_space->local_used += bnzi;
4972     current_space->local_remaining -= bnzi;
4973 
4974     bi[i + 1] = bi[i] + bnzi;
4975   }
4976 
4977   PetscCall(PetscFree3(buf_ri_k, nextrow, nextai));
4978 
4979   PetscCall(PetscMalloc1(bi[m] + 1, &bj));
4980   PetscCall(PetscFreeSpaceContiguous(&free_space, bj));
4981   PetscCall(PetscLLDestroy(lnk, lnkbt));
4982 
4983   /* create symbolic parallel matrix B_mpi */
4984   PetscCall(MatGetBlockSizes(seqmat, &bs, &cbs));
4985   PetscCall(MatCreate(comm, &B_mpi));
4986   if (n == PETSC_DECIDE) {
4987     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, N));
4988   } else {
4989     PetscCall(MatSetSizes(B_mpi, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
4990   }
4991   PetscCall(MatSetBlockSizes(B_mpi, bs, cbs));
4992   PetscCall(MatSetType(B_mpi, MATMPIAIJ));
4993   PetscCall(MatMPIAIJSetPreallocation(B_mpi, 0, dnz, 0, onz));
4994   MatPreallocateEnd(dnz, onz);
4995   PetscCall(MatSetOption(B_mpi, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE));
4996 
4997   /* B_mpi is not ready for use - assembly will be done by MatCreateMPIAIJSumSeqAIJNumeric() */
4998   B_mpi->assembled = PETSC_FALSE;
4999   merge->bi        = bi;
5000   merge->bj        = bj;
5001   merge->buf_ri    = buf_ri;
5002   merge->buf_rj    = buf_rj;
5003   merge->coi       = NULL;
5004   merge->coj       = NULL;
5005   merge->owners_co = NULL;
5006 
5007   PetscCall(PetscCommDestroy(&comm));
5008 
5009   /* attach the supporting struct to B_mpi for reuse */
5010   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
5011   PetscCall(PetscContainerSetPointer(container, merge));
5012   PetscCall(PetscContainerSetCtxDestroy(container, MatDestroy_MPIAIJ_SeqsToMPI));
5013   PetscCall(PetscObjectCompose((PetscObject)B_mpi, "MatMergeSeqsToMPI", (PetscObject)container));
5014   PetscCall(PetscContainerDestroy(&container));
5015   *mpimat = B_mpi;
5016 
5017   PetscCall(PetscLogEventEnd(MAT_Seqstompisym, seqmat, 0, 0, 0));
5018   PetscFunctionReturn(PETSC_SUCCESS);
5019 }
5020 
5021 /*@
5022   MatCreateMPIAIJSumSeqAIJ - Creates a `MATMPIAIJ` matrix by adding sequential
5023   matrices from each processor
5024 
5025   Collective
5026 
5027   Input Parameters:
5028 + comm   - the communicator the parallel matrix will live on
5029 . seqmat - the input sequential matrix (one per MPI process)
5030 . m      - number of local rows (or `PETSC_DECIDE`)
5031 . n      - number of local columns (or `PETSC_DECIDE`)
5032 - scall  - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5033 
5034   Output Parameter:
5035 . mpimat - the parallel matrix generated
5036 
5037   Level: advanced
5038 
5039   Note:
5040   The dimensions of the sequential matrix on each process MUST be the same.
5041   The input `seqmat` is stored in the container "Mat_Merge_SeqsToMPI" and will be
5042   destroyed when `mpimat` is destroyed. Call `PetscObjectQuery()` to access `seqmat`.
5043 
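  Example Usage:
  A minimal sketch; `seqA` here is a hypothetical, already assembled `MATSEQAIJ` matrix held by each rank.
.vb
  Mat A;
  PetscCall(MatCreateMPIAIJSumSeqAIJ(PETSC_COMM_WORLD, seqA, PETSC_DECIDE, PETSC_DECIDE, MAT_INITIAL_MATRIX, &A));
  // ... update the numerical values of seqA, keeping its nonzero pattern, then refresh A ...
  PetscCall(MatCreateMPIAIJSumSeqAIJ(PETSC_COMM_WORLD, seqA, PETSC_DECIDE, PETSC_DECIDE, MAT_REUSE_MATRIX, &A));
.ve
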
5044 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJ()`
5045 @*/
5046 PetscErrorCode MatCreateMPIAIJSumSeqAIJ(MPI_Comm comm, Mat seqmat, PetscInt m, PetscInt n, MatReuse scall, Mat *mpimat)
5047 {
5048   PetscMPIInt size;
5049 
5050   PetscFunctionBegin;
5051   PetscCallMPI(MPI_Comm_size(comm, &size));
5052   if (size == 1) {
5053     PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5054     if (scall == MAT_INITIAL_MATRIX) {
5055       PetscCall(MatDuplicate(seqmat, MAT_COPY_VALUES, mpimat));
5056     } else {
5057       PetscCall(MatCopy(seqmat, *mpimat, SAME_NONZERO_PATTERN));
5058     }
5059     PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5060     PetscFunctionReturn(PETSC_SUCCESS);
5061   }
5062   PetscCall(PetscLogEventBegin(MAT_Seqstompi, seqmat, 0, 0, 0));
5063   if (scall == MAT_INITIAL_MATRIX) PetscCall(MatCreateMPIAIJSumSeqAIJSymbolic(comm, seqmat, m, n, mpimat));
5064   PetscCall(MatCreateMPIAIJSumSeqAIJNumeric(seqmat, *mpimat));
5065   PetscCall(PetscLogEventEnd(MAT_Seqstompi, seqmat, 0, 0, 0));
5066   PetscFunctionReturn(PETSC_SUCCESS);
5067 }
5068 
5069 /*@
5070   MatAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATAIJ` matrix.
5071 
5072   Not Collective
5073 
5074   Input Parameter:
5075 . A - the matrix
5076 
5077   Output Parameter:
5078 . A_loc - the local sequential matrix generated
5079 
5080   Level: developer
5081 
5082   Notes:
5083   The matrix is created by taking `A`'s local rows and putting them into a sequential matrix
5084   with `mlocal` rows and `n` columns, where `mlocal` is obtained with `MatGetLocalSize()` and
5085   `n` is the global column count obtained with `MatGetSize()`.
5086 
5087   In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5088 
5089   For parallel matrices this creates an entirely new matrix. If the matrix is sequential it merely increases the reference count.
5090 
5091   Destroy the matrix with `MatDestroy()`
5092 
5093 .seealso: [](ch_matrices), `Mat`, `MatMPIAIJGetLocalMat()`
5094 @*/
5095 PetscErrorCode MatAIJGetLocalMat(Mat A, Mat *A_loc)
5096 {
5097   PetscBool mpi;
5098 
5099   PetscFunctionBegin;
5100   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &mpi));
5101   if (mpi) {
5102     PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, A_loc));
5103   } else {
5104     *A_loc = A;
5105     PetscCall(PetscObjectReference((PetscObject)*A_loc));
5106   }
5107   PetscFunctionReturn(PETSC_SUCCESS);
5108 }
5109 
5110 /*@
5111   MatMPIAIJGetLocalMat - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix.
5112 
5113   Not Collective
5114 
5115   Input Parameters:
5116 + A     - the matrix
5117 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5118 
5119   Output Parameter:
5120 . A_loc - the local sequential matrix generated
5121 
5122   Level: developer
5123 
5124   Notes:
5125   The matrix is created by taking all `A`'s local rows and putting them into a sequential
5126   matrix with `mlocal` rows and `n` columns. `mlocal` is the row count obtained with
5127   `MatGetLocalSize()` and `n` is the global column count obtained with `MatGetSize()`.
5128 
5129   In other words, it combines the two parts of a parallel `MATMPIAIJ` matrix on each process into a single matrix.
5130 
5131   When `A` is sequential and `MAT_INITIAL_MATRIX` is requested, the matrix returned is the diagonal part of `A` (which contains the entire matrix),
5132   with its reference count increased by one. Hence changing values of `A_loc` changes `A`. If `MAT_REUSE_MATRIX` is requested on a sequential matrix
5133   then `MatCopy`(Adiag,*`A_loc`,`SAME_NONZERO_PATTERN`) is called to fill `A_loc`. Thus one can preallocate the appropriate sequential matrix `A_loc`
5134   and then call this routine with `MAT_REUSE_MATRIX`. In this case, one can modify the values of `A_loc` without affecting the original sequential matrix.
5135 
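  Example Usage:
  A minimal sketch; `A` is assumed to be an already assembled `MATMPIAIJ` matrix.
.vb
  Mat Aloc;
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &Aloc));
  // ... the values of A change, with an unchanged nonzero pattern ...
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &Aloc));
  PetscCall(MatDestroy(&Aloc));
.ve
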
5136 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMatCondensed()`, `MatMPIAIJGetLocalMatMerge()`
5137 @*/
5138 PetscErrorCode MatMPIAIJGetLocalMat(Mat A, MatReuse scall, Mat *A_loc)
5139 {
5140   Mat_MPIAIJ        *mpimat = (Mat_MPIAIJ *)A->data;
5141   Mat_SeqAIJ        *mat, *a, *b;
5142   PetscInt          *ai, *aj, *bi, *bj, *cmap = mpimat->garray;
5143   const PetscScalar *aa, *ba, *aav, *bav;
5144   PetscScalar       *ca, *cam;
5145   PetscMPIInt        size;
5146   PetscInt           am = A->rmap->n, i, j, k, cstart = A->cmap->rstart;
5147   PetscInt          *ci, *cj, col, ncols_d, ncols_o, jo;
5148   PetscBool          match;
5149 
5150   PetscFunctionBegin;
5151   PetscCall(PetscStrbeginswith(((PetscObject)A)->type_name, MATMPIAIJ, &match));
5152   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5153   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5154   if (size == 1) {
5155     if (scall == MAT_INITIAL_MATRIX) {
5156       PetscCall(PetscObjectReference((PetscObject)mpimat->A));
5157       *A_loc = mpimat->A;
5158     } else if (scall == MAT_REUSE_MATRIX) {
5159       PetscCall(MatCopy(mpimat->A, *A_loc, SAME_NONZERO_PATTERN));
5160     }
5161     PetscFunctionReturn(PETSC_SUCCESS);
5162   }
5163 
5164   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5165   a  = (Mat_SeqAIJ *)mpimat->A->data;
5166   b  = (Mat_SeqAIJ *)mpimat->B->data;
5167   ai = a->i;
5168   aj = a->j;
5169   bi = b->i;
5170   bj = b->j;
5171   PetscCall(MatSeqAIJGetArrayRead(mpimat->A, &aav));
5172   PetscCall(MatSeqAIJGetArrayRead(mpimat->B, &bav));
5173   aa = aav;
5174   ba = bav;
5175   if (scall == MAT_INITIAL_MATRIX) {
5176     PetscCall(PetscMalloc1(1 + am, &ci));
5177     ci[0] = 0;
5178     for (i = 0; i < am; i++) ci[i + 1] = ci[i] + (ai[i + 1] - ai[i]) + (bi[i + 1] - bi[i]);
5179     PetscCall(PetscMalloc1(1 + ci[am], &cj));
5180     PetscCall(PetscMalloc1(1 + ci[am], &ca));
5181     k = 0;
5182     for (i = 0; i < am; i++) {
5183       ncols_o = bi[i + 1] - bi[i];
5184       ncols_d = ai[i + 1] - ai[i];
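      /* Merge so that each output row is sorted by global column index: B-columns with global index
         less than cstart come first, then the diagonal block, then the remaining B-columns */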
5185       /* off-diagonal portion of A */
5186       for (jo = 0; jo < ncols_o; jo++) {
5187         col = cmap[*bj];
5188         if (col >= cstart) break;
5189         cj[k] = col;
5190         bj++;
5191         ca[k++] = *ba++;
5192       }
5193       /* diagonal portion of A */
5194       for (j = 0; j < ncols_d; j++) {
5195         cj[k]   = cstart + *aj++;
5196         ca[k++] = *aa++;
5197       }
5198       /* off-diagonal portion of A */
5199       for (j = jo; j < ncols_o; j++) {
5200         cj[k]   = cmap[*bj++];
5201         ca[k++] = *ba++;
5202       }
5203     }
5204     /* put together the new matrix */
5205     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, A->cmap->N, ci, cj, ca, A_loc));
5206     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5207     /* Since these are PETSc arrays, change flags to free them as necessary. */
5208     mat          = (Mat_SeqAIJ *)(*A_loc)->data;
5209     mat->free_a  = PETSC_TRUE;
5210     mat->free_ij = PETSC_TRUE;
5211     mat->nonew   = 0;
5212   } else if (scall == MAT_REUSE_MATRIX) {
5213     mat = (Mat_SeqAIJ *)(*A_loc)->data;
5214     ci  = mat->i;
5215     cj  = mat->j;
5216     PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &cam));
5217     for (i = 0; i < am; i++) {
5218       /* off-diagonal portion of A */
5219       ncols_o = bi[i + 1] - bi[i];
5220       for (jo = 0; jo < ncols_o; jo++) {
5221         col = cmap[*bj];
5222         if (col >= cstart) break;
5223         *cam++ = *ba++;
5224         bj++;
5225       }
5226       /* diagonal portion of A */
5227       ncols_d = ai[i + 1] - ai[i];
5228       for (j = 0; j < ncols_d; j++) *cam++ = *aa++;
5229       /* off-diagonal portion of A */
5230       for (j = jo; j < ncols_o; j++) {
5231         *cam++ = *ba++;
5232         bj++;
5233       }
5234     }
5235     PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &cam));
5236   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5237   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->A, &aav));
5238   PetscCall(MatSeqAIJRestoreArrayRead(mpimat->B, &bav));
5239   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5240   PetscFunctionReturn(PETSC_SUCCESS);
5241 }
5242 
5243 /*@
5244   MatMPIAIJGetLocalMatMerge - Creates a `MATSEQAIJ` from a `MATMPIAIJ` matrix by taking all its local rows and putting them into a sequential matrix with
5245   mlocal rows and n columns, where n is the sum of the number of columns of the diagonal and off-diagonal parts
5246 
5247   Not Collective
5248 
5249   Input Parameters:
5250 + A     - the matrix
5251 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5252 
5253   Output Parameters:
5254 + glob  - sequential `IS` with global indices associated with the columns of the local sequential matrix generated (can be `NULL`)
5255 - A_loc - the local sequential matrix generated
5256 
5257   Level: developer
5258 
5259   Note:
5260   This is different from `MatMPIAIJGetLocalMat()` since the first columns in the returned matrix are those associated with the diagonal
5261   part, followed by those associated with the off-diagonal part (in its local ordering)
5262 
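  Example Usage:
  A minimal sketch; `A` is assumed to be an already assembled `MATMPIAIJ` matrix.
.vb
  Mat             Aloc;
  IS              glob;
  const PetscInt *gidx;
  PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &Aloc));
  PetscCall(ISGetIndices(glob, &gidx)); // gidx[c] is the global column index of local column c of Aloc
  // ... use Aloc and gidx ...
  PetscCall(ISRestoreIndices(glob, &gidx));
  PetscCall(ISDestroy(&glob));
  PetscCall(MatDestroy(&Aloc));
.ve
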
5263 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`, `MatMPIAIJGetLocalMatCondensed()`
5264 @*/
5265 PetscErrorCode MatMPIAIJGetLocalMatMerge(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
5266 {
5267   Mat             Ao, Ad;
5268   const PetscInt *cmap;
5269   PetscMPIInt     size;
5270   PetscErrorCode (*f)(Mat, MatReuse, IS *, Mat *);
5271 
5272   PetscFunctionBegin;
5273   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
5274   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
5275   if (size == 1) {
5276     if (scall == MAT_INITIAL_MATRIX) {
5277       PetscCall(PetscObjectReference((PetscObject)Ad));
5278       *A_loc = Ad;
5279     } else if (scall == MAT_REUSE_MATRIX) {
5280       PetscCall(MatCopy(Ad, *A_loc, SAME_NONZERO_PATTERN));
5281     }
5282     if (glob) PetscCall(ISCreateStride(PetscObjectComm((PetscObject)Ad), Ad->cmap->n, Ad->cmap->rstart, 1, glob));
5283     PetscFunctionReturn(PETSC_SUCCESS);
5284   }
5285   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", &f));
5286   PetscCall(PetscLogEventBegin(MAT_Getlocalmat, A, 0, 0, 0));
5287   if (f) {
5288     PetscCall((*f)(A, scall, glob, A_loc));
5289   } else {
5290     Mat_SeqAIJ        *a = (Mat_SeqAIJ *)Ad->data;
5291     Mat_SeqAIJ        *b = (Mat_SeqAIJ *)Ao->data;
5292     Mat_SeqAIJ        *c;
5293     PetscInt          *ai = a->i, *aj = a->j;
5294     PetscInt          *bi = b->i, *bj = b->j;
5295     PetscInt          *ci, *cj;
5296     const PetscScalar *aa, *ba;
5297     PetscScalar       *ca;
5298     PetscInt           i, j, am, dn, on;
5299 
5300     PetscCall(MatGetLocalSize(Ad, &am, &dn));
5301     PetscCall(MatGetLocalSize(Ao, NULL, &on));
5302     PetscCall(MatSeqAIJGetArrayRead(Ad, &aa));
5303     PetscCall(MatSeqAIJGetArrayRead(Ao, &ba));
5304     if (scall == MAT_INITIAL_MATRIX) {
5305       PetscInt k;
5306       PetscCall(PetscMalloc1(1 + am, &ci));
5307       PetscCall(PetscMalloc1(ai[am] + bi[am], &cj));
5308       PetscCall(PetscMalloc1(ai[am] + bi[am], &ca));
5309       ci[0] = 0;
5310       for (i = 0, k = 0; i < am; i++) {
5311         const PetscInt ncols_o = bi[i + 1] - bi[i];
5312         const PetscInt ncols_d = ai[i + 1] - ai[i];
5313         ci[i + 1]              = ci[i] + ncols_o + ncols_d;
5314         /* diagonal portion of A */
5315         for (j = 0; j < ncols_d; j++, k++) {
5316           cj[k] = *aj++;
5317           ca[k] = *aa++;
5318         }
5319         /* off-diagonal portion of A */
5320         for (j = 0; j < ncols_o; j++, k++) {
5321           cj[k] = dn + *bj++;
5322           ca[k] = *ba++;
5323         }
5324       }
5325       /* put together the new matrix */
5326       PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, am, dn + on, ci, cj, ca, A_loc));
5327       /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5328       /* Since these are PETSc arrays, change flags to free them as necessary. */
5329       c          = (Mat_SeqAIJ *)(*A_loc)->data;
5330       c->free_a  = PETSC_TRUE;
5331       c->free_ij = PETSC_TRUE;
5332       c->nonew   = 0;
5333       PetscCall(MatSetType(*A_loc, ((PetscObject)Ad)->type_name));
5334     } else if (scall == MAT_REUSE_MATRIX) {
5335       PetscCall(MatSeqAIJGetArrayWrite(*A_loc, &ca));
5336       for (i = 0; i < am; i++) {
5337         const PetscInt ncols_d = ai[i + 1] - ai[i];
5338         const PetscInt ncols_o = bi[i + 1] - bi[i];
5339         /* diagonal portion of A */
5340         for (j = 0; j < ncols_d; j++) *ca++ = *aa++;
5341         /* off-diagonal portion of A */
5342         for (j = 0; j < ncols_o; j++) *ca++ = *ba++;
5343       }
5344       PetscCall(MatSeqAIJRestoreArrayWrite(*A_loc, &ca));
5345     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid MatReuse %d", (int)scall);
5346     PetscCall(MatSeqAIJRestoreArrayRead(Ad, &aa));
5347     PetscCall(MatSeqAIJRestoreArrayRead(Ao, &ba));
5348     if (glob) {
5349       PetscInt cst, *gidx;
5350 
5351       PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
5352       PetscCall(PetscMalloc1(dn + on, &gidx));
5353       for (i = 0; i < dn; i++) gidx[i] = cst + i;
5354       for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
5355       PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
5356     }
5357   }
5358   PetscCall(PetscLogEventEnd(MAT_Getlocalmat, A, 0, 0, 0));
5359   PetscFunctionReturn(PETSC_SUCCESS);
5360 }
5361 
5362 /*@C
5363   MatMPIAIJGetLocalMatCondensed - Creates a `MATSEQAIJ` matrix from a `MATMPIAIJ` matrix by taking all its local rows and NON-ZERO columns
5364 
5365   Not Collective
5366 
5367   Input Parameters:
5368 + A     - the matrix
5369 . scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5370 . row   - index set of rows to extract (or `NULL`)
5371 - col   - index set of columns to extract (or `NULL`)
5372 
5373   Output Parameter:
5374 . A_loc - the local sequential matrix generated
5375 
5376   Level: developer
5377 
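  Example Usage:
  A minimal sketch; `A` is assumed to be an already assembled `MATMPIAIJ` matrix. Passing `NULL` for
  `row` and `col` selects all local rows and the nonzero columns.
.vb
  Mat Aloc;
  PetscCall(MatMPIAIJGetLocalMatCondensed(A, MAT_INITIAL_MATRIX, NULL, NULL, &Aloc));
  // ... use Aloc ...
  PetscCall(MatDestroy(&Aloc));
.ve
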
5378 .seealso: [](ch_matrices), `Mat`, `MATMPIAIJ`, `MatGetOwnershipRange()`, `MatMPIAIJGetLocalMat()`
5379 @*/
5380 PetscErrorCode MatMPIAIJGetLocalMatCondensed(Mat A, MatReuse scall, IS *row, IS *col, Mat *A_loc)
5381 {
5382   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5383   PetscInt    i, start, end, ncols, nzA, nzB, *cmap, imark, *idx;
5384   IS          isrowa, iscola;
5385   Mat        *aloc;
5386   PetscBool   match;
5387 
5388   PetscFunctionBegin;
5389   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATMPIAIJ, &match));
5390   PetscCheck(match, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Requires MATMPIAIJ matrix as input");
5391   PetscCall(PetscLogEventBegin(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5392   if (!row) {
5393     start = A->rmap->rstart;
5394     end   = A->rmap->rend;
5395     PetscCall(ISCreateStride(PETSC_COMM_SELF, end - start, start, 1, &isrowa));
5396   } else {
5397     isrowa = *row;
5398   }
5399   if (!col) {
5400     start = A->cmap->rstart;
5401     cmap  = a->garray;
5402     nzA   = a->A->cmap->n;
5403     nzB   = a->B->cmap->n;
5404     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5405     ncols = 0;
5406     for (i = 0; i < nzB; i++) {
5407       if (cmap[i] < start) idx[ncols++] = cmap[i];
5408       else break;
5409     }
5410     imark = i;
5411     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;
5412     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i];
5413     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &iscola));
5414   } else {
5415     iscola = *col;
5416   }
5417   if (scall != MAT_INITIAL_MATRIX) {
5418     PetscCall(PetscMalloc1(1, &aloc));
5419     aloc[0] = *A_loc;
5420   }
5421   PetscCall(MatCreateSubMatrices(A, 1, &isrowa, &iscola, scall, &aloc));
5422   if (!col) { /* attach global id of condensed columns */
5423     PetscCall(PetscObjectCompose((PetscObject)aloc[0], "_petsc_GetLocalMatCondensed_iscol", (PetscObject)iscola));
5424   }
5425   *A_loc = aloc[0];
5426   PetscCall(PetscFree(aloc));
5427   if (!row) PetscCall(ISDestroy(&isrowa));
5428   if (!col) PetscCall(ISDestroy(&iscola));
5429   PetscCall(PetscLogEventEnd(MAT_Getlocalmatcondensed, A, 0, 0, 0));
5430   PetscFunctionReturn(PETSC_SUCCESS);
5431 }
5432 
5433 /*
5434  * Create a sequential AIJ matrix based on row indices; the whole row (all of its columns) is extracted once a row is matched.
5435  * Rows could be local or remote. The routine is designed to be scalable in memory so that nothing is based
5436  * on a global size.
5437  * */
5438 static PetscErrorCode MatCreateSeqSubMatrixWithRows_Private(Mat P, IS rows, Mat *P_oth)
5439 {
5440   Mat_MPIAIJ            *p  = (Mat_MPIAIJ *)P->data;
5441   Mat_SeqAIJ            *pd = (Mat_SeqAIJ *)p->A->data, *po = (Mat_SeqAIJ *)p->B->data, *p_oth;
5442   PetscInt               plocalsize, nrows, *ilocal, *oilocal, i, lidx, *nrcols, *nlcols, ncol;
5443   PetscMPIInt            owner;
5444   PetscSFNode           *iremote, *oiremote;
5445   const PetscInt        *lrowindices;
5446   PetscSF                sf, osf;
5447   PetscInt               pcstart, *roffsets, *loffsets, *pnnz, j;
5448   PetscInt               ontotalcols, dntotalcols, ntotalcols, nout;
5449   MPI_Comm               comm;
5450   ISLocalToGlobalMapping mapping;
5451   const PetscScalar     *pd_a, *po_a;
5452 
5453   PetscFunctionBegin;
5454   PetscCall(PetscObjectGetComm((PetscObject)P, &comm));
5455   /* plocalsize is the number of roots
5456    * nrows is the number of leaves
5457    * */
5458   PetscCall(MatGetLocalSize(P, &plocalsize, NULL));
5459   PetscCall(ISGetLocalSize(rows, &nrows));
5460   PetscCall(PetscCalloc1(nrows, &iremote));
5461   PetscCall(ISGetIndices(rows, &lrowindices));
5462   for (i = 0; i < nrows; i++) {
5463     /* Find a remote index and an owner for a row
5464      * The row could be local or remote
5465      * */
5466     owner = 0;
5467     lidx  = 0;
5468     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, &lidx));
5469     iremote[i].index = lidx;
5470     iremote[i].rank  = owner;
5471   }
5472   /* Create SF to communicate how many nonzero columns for each row */
5473   PetscCall(PetscSFCreate(comm, &sf));
5474   /* SF will figure out the number of nonzero columns for each row, and their
5475    * offsets
5476    * */
5477   PetscCall(PetscSFSetGraph(sf, plocalsize, nrows, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5478   PetscCall(PetscSFSetFromOptions(sf));
5479   PetscCall(PetscSFSetUp(sf));
5480 
5481   PetscCall(PetscCalloc1(2 * (plocalsize + 1), &roffsets));
5482   PetscCall(PetscCalloc1(2 * plocalsize, &nrcols));
5483   PetscCall(PetscCalloc1(nrows, &pnnz));
5484   roffsets[0] = 0;
5485   roffsets[1] = 0;
5486   for (i = 0; i < plocalsize; i++) {
5487     /* diagonal */
5488     nrcols[i * 2 + 0] = pd->i[i + 1] - pd->i[i];
5489     /* off-diagonal */
5490     nrcols[i * 2 + 1] = po->i[i + 1] - po->i[i];
5491     /* compute offsets so that we know the relative location of each row */
5492     roffsets[(i + 1) * 2 + 0] = roffsets[i * 2 + 0] + nrcols[i * 2 + 0];
5493     roffsets[(i + 1) * 2 + 1] = roffsets[i * 2 + 1] + nrcols[i * 2 + 1];
5494   }
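  /* For example (hypothetical data): with pd->i = {0, 2, 3} and po->i = {0, 1, 3}, we get
     nrcols = {2, 1, 1, 2} and roffsets = {0, 0, 2, 1, 3, 3}; entry 2*i holds the diagonal
     count/offset for local row i and entry 2*i+1 the off-diagonal one */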
5495   PetscCall(PetscCalloc1(2 * nrows, &nlcols));
5496   PetscCall(PetscCalloc1(2 * nrows, &loffsets));
5497   /* 'r' means root, and 'l' means leaf */
5498   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5499   PetscCall(PetscSFBcastBegin(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5500   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, nrcols, nlcols, MPI_REPLACE));
5501   PetscCall(PetscSFBcastEnd(sf, MPIU_2INT, roffsets, loffsets, MPI_REPLACE));
5502   PetscCall(PetscSFDestroy(&sf));
5503   PetscCall(PetscFree(roffsets));
5504   PetscCall(PetscFree(nrcols));
5505   dntotalcols = 0;
5506   ontotalcols = 0;
5507   ncol        = 0;
5508   for (i = 0; i < nrows; i++) {
5509     pnnz[i] = nlcols[i * 2 + 0] + nlcols[i * 2 + 1];
5510     ncol    = PetscMax(pnnz[i], ncol);
5511     /* diagonal */
5512     dntotalcols += nlcols[i * 2 + 0];
5513     /* off-diagonal */
5514     ontotalcols += nlcols[i * 2 + 1];
5515   }
5516   /* We do not need to figure out the right number of columns
5517    * since all the calculations will be done by going through the raw data
5518    * */
5519   PetscCall(MatCreateSeqAIJ(PETSC_COMM_SELF, nrows, ncol, 0, pnnz, P_oth));
5520   PetscCall(MatSetUp(*P_oth));
5521   PetscCall(PetscFree(pnnz));
5522   p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5523   /* diagonal */
5524   PetscCall(PetscCalloc1(dntotalcols, &iremote));
5525   /* off-diagonal */
5526   PetscCall(PetscCalloc1(ontotalcols, &oiremote));
5527   /* diagonal */
5528   PetscCall(PetscCalloc1(dntotalcols, &ilocal));
5529   /* off-diagonal */
5530   PetscCall(PetscCalloc1(ontotalcols, &oilocal));
5531   dntotalcols = 0;
5532   ontotalcols = 0;
5533   ntotalcols  = 0;
5534   for (i = 0; i < nrows; i++) {
5535     owner = 0;
5536     PetscCall(PetscLayoutFindOwnerIndex(P->rmap, lrowindices[i], &owner, NULL));
5537     /* Set iremote for diag matrix */
5538     for (j = 0; j < nlcols[i * 2 + 0]; j++) {
5539       iremote[dntotalcols].index = loffsets[i * 2 + 0] + j;
5540       iremote[dntotalcols].rank  = owner;
5541       /* P_oth is SeqAIJ, so ilocal needs to point to the first part of memory */
5542       ilocal[dntotalcols++] = ntotalcols++;
5543     }
5544     /* off-diagonal */
5545     for (j = 0; j < nlcols[i * 2 + 1]; j++) {
5546       oiremote[ontotalcols].index = loffsets[i * 2 + 1] + j;
5547       oiremote[ontotalcols].rank  = owner;
5548       oilocal[ontotalcols++]      = ntotalcols++;
5549     }
5550   }
5551   PetscCall(ISRestoreIndices(rows, &lrowindices));
5552   PetscCall(PetscFree(loffsets));
5553   PetscCall(PetscFree(nlcols));
5554   PetscCall(PetscSFCreate(comm, &sf));
5555   /* P serves as roots and P_oth is leaves
5556    * Diag matrix
5557    * */
5558   PetscCall(PetscSFSetGraph(sf, pd->i[plocalsize], dntotalcols, ilocal, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
5559   PetscCall(PetscSFSetFromOptions(sf));
5560   PetscCall(PetscSFSetUp(sf));
5561 
5562   PetscCall(PetscSFCreate(comm, &osf));
5563   /* off-diagonal */
5564   PetscCall(PetscSFSetGraph(osf, po->i[plocalsize], ontotalcols, oilocal, PETSC_OWN_POINTER, oiremote, PETSC_OWN_POINTER));
5565   PetscCall(PetscSFSetFromOptions(osf));
5566   PetscCall(PetscSFSetUp(osf));
5567   PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5568   PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5569   /* operate on the matrix internal data to save memory */
5570   PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5571   PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5572   PetscCall(MatGetOwnershipRangeColumn(P, &pcstart, NULL));
5573   /* Convert to global indices for diag matrix */
5574   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] += pcstart;
5575   PetscCall(PetscSFBcastBegin(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
5576   /* We want P_oth to store global indices */
5577   PetscCall(ISLocalToGlobalMappingCreate(comm, 1, p->B->cmap->n, p->garray, PETSC_COPY_VALUES, &mapping));
5578   /* Use memory scalable approach */
5579   PetscCall(ISLocalToGlobalMappingSetType(mapping, ISLOCALTOGLOBALMAPPINGHASH));
5580   PetscCall(ISLocalToGlobalMappingApply(mapping, po->i[plocalsize], po->j, po->j));
5581   PetscCall(PetscSFBcastBegin(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5582   PetscCall(PetscSFBcastEnd(sf, MPIU_INT, pd->j, p_oth->j, MPI_REPLACE));
5583   /* Convert back to local indices */
5584   for (i = 0; i < pd->i[plocalsize]; i++) pd->j[i] -= pcstart;
5585   PetscCall(PetscSFBcastEnd(osf, MPIU_INT, po->j, p_oth->j, MPI_REPLACE));
5586   nout = 0;
5587   PetscCall(ISGlobalToLocalMappingApply(mapping, IS_GTOLM_DROP, po->i[plocalsize], po->j, &nout, po->j));
5588   PetscCheck(nout == po->i[plocalsize], comm, PETSC_ERR_ARG_INCOMP, "n %" PetscInt_FMT " does not equal nout %" PetscInt_FMT, po->i[plocalsize], nout);
5589   PetscCall(ISLocalToGlobalMappingDestroy(&mapping));
5590   /* Exchange values */
5591   PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5592   PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5593   PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5594   PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5595   /* Stop PETSc from shrinking memory */
5596   for (i = 0; i < nrows; i++) p_oth->ilen[i] = p_oth->imax[i];
5597   PetscCall(MatAssemblyBegin(*P_oth, MAT_FINAL_ASSEMBLY));
5598   PetscCall(MatAssemblyEnd(*P_oth, MAT_FINAL_ASSEMBLY));
5599   /* Attach PetscSF objects to P_oth so that we can reuse it later */
5600   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "diagsf", (PetscObject)sf));
5601   PetscCall(PetscObjectCompose((PetscObject)*P_oth, "offdiagsf", (PetscObject)osf));
5602   PetscCall(PetscSFDestroy(&sf));
5603   PetscCall(PetscSFDestroy(&osf));
5604   PetscFunctionReturn(PETSC_SUCCESS);
5605 }
5606 
5607 /*
5608  * Creates a SeqAIJ matrix by taking the rows of B that correspond to the nonzero columns of local A.
5609  * This supports MPIAIJ and MAIJ.
5610  * */
5611 PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat A, Mat P, PetscInt dof, MatReuse reuse, Mat *P_oth)
5612 {
5613   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *p = (Mat_MPIAIJ *)P->data;
5614   Mat_SeqAIJ *p_oth;
5615   IS          rows, map;
5616   PetscHMapI  hamp;
5617   PetscInt    i, htsize, *rowindices, off, *mapping, key, count;
5618   MPI_Comm    comm;
5619   PetscSF     sf, osf;
5620   PetscBool   has;
5621 
5622   PetscFunctionBegin;
5623   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5624   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, P, 0, 0));
5625   /* If it is the first time, create an index set of off-diag nonzero columns of A,
5626    *  and then create a submatrix (that often is an overlapping matrix)
5627    * */
5628   if (reuse == MAT_INITIAL_MATRIX) {
5629     /* Use a hash table to figure out unique keys */
5630     PetscCall(PetscHMapICreateWithSize(a->B->cmap->n, &hamp));
5631     PetscCall(PetscCalloc1(a->B->cmap->n, &mapping));
5632     count = 0;
5633     /* Assume that a->garray is sorted, otherwise the following does not make sense */
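    /* For example (hypothetical data): with dof = 2 and a->garray = {4, 5, 8}, the keys are {2, 2, 4},
       so mapping becomes {0, 0, 1} and count ends up as 2 */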
5634     for (i = 0; i < a->B->cmap->n; i++) {
5635       key = a->garray[i] / dof;
5636       PetscCall(PetscHMapIHas(hamp, key, &has));
5637       if (!has) {
5638         mapping[i] = count;
5639         PetscCall(PetscHMapISet(hamp, key, count++));
5640       } else {
5641         /* Current 'i' has the same key as the previous step */
5642         mapping[i] = count - 1;
5643       }
5644     }
5645     PetscCall(ISCreateGeneral(comm, a->B->cmap->n, mapping, PETSC_OWN_POINTER, &map));
5646     PetscCall(PetscHMapIGetSize(hamp, &htsize));
5647     PetscCheck(htsize == count, comm, PETSC_ERR_ARG_INCOMP, " Size of hash map %" PetscInt_FMT " is inconsistent with count %" PetscInt_FMT, htsize, count);
5648     PetscCall(PetscCalloc1(htsize, &rowindices));
5649     off = 0;
5650     PetscCall(PetscHMapIGetKeys(hamp, &off, rowindices));
5651     PetscCall(PetscHMapIDestroy(&hamp));
5652     PetscCall(PetscSortInt(htsize, rowindices));
5653     PetscCall(ISCreateGeneral(comm, htsize, rowindices, PETSC_OWN_POINTER, &rows));
5654     /* In case the matrix was already created but the user wants to recreate it */
5655     PetscCall(MatDestroy(P_oth));
5656     PetscCall(MatCreateSeqSubMatrixWithRows_Private(P, rows, P_oth));
5657     PetscCall(PetscObjectCompose((PetscObject)*P_oth, "aoffdiagtopothmapping", (PetscObject)map));
5658     PetscCall(ISDestroy(&map));
5659     PetscCall(ISDestroy(&rows));
5660   } else if (reuse == MAT_REUSE_MATRIX) {
5661     /* If the matrix was already created, we simply update values using the SF objects
5662      * that were attached to the matrix earlier.
5663      */
5664     const PetscScalar *pd_a, *po_a;
5665 
5666     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "diagsf", (PetscObject *)&sf));
5667     PetscCall(PetscObjectQuery((PetscObject)*P_oth, "offdiagsf", (PetscObject *)&osf));
5668     PetscCheck(sf && osf, comm, PETSC_ERR_ARG_NULL, "Matrix is not initialized yet");
5669     p_oth = (Mat_SeqAIJ *)(*P_oth)->data;
5670     /* Update values in place */
5671     PetscCall(MatSeqAIJGetArrayRead(p->A, &pd_a));
5672     PetscCall(MatSeqAIJGetArrayRead(p->B, &po_a));
5673     PetscCall(PetscSFBcastBegin(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5674     PetscCall(PetscSFBcastBegin(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5675     PetscCall(PetscSFBcastEnd(sf, MPIU_SCALAR, pd_a, p_oth->a, MPI_REPLACE));
5676     PetscCall(PetscSFBcastEnd(osf, MPIU_SCALAR, po_a, p_oth->a, MPI_REPLACE));
5677     PetscCall(MatSeqAIJRestoreArrayRead(p->A, &pd_a));
5678     PetscCall(MatSeqAIJRestoreArrayRead(p->B, &po_a));
5679   } else SETERRQ(comm, PETSC_ERR_ARG_UNKNOWN_TYPE, "Unknown reuse type");
5680   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, P, 0, 0));
5681   PetscFunctionReturn(PETSC_SUCCESS);
5682 }
5683 
5684 /*@C
5685   MatGetBrowsOfAcols - Returns the `IS` that contain the rows of `B` corresponding to the nonzero columns of local `A`
5686 
5687   Collective
5688 
5689   Input Parameters:
5690 + A     - the first matrix in `MATMPIAIJ` format
5691 . B     - the second matrix in `MATMPIAIJ` format
5692 - scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5693 
5694   Output Parameters:
5695 + rowb  - on input, the index set of rows of B to extract (or `NULL`); modified on output
5696 . colb  - on input, the index set of columns of B to extract (or `NULL`); modified on output
5697 - B_seq - the sequential matrix generated
5698 
5699   Level: developer
5700 
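  Example Usage:
  A minimal sketch; `A` and `B` are assumed to be already assembled `MATMPIAIJ` matrices with compatible layouts.
.vb
  IS  rowb = NULL, colb = NULL;
  Mat B_seq;
  PetscCall(MatGetBrowsOfAcols(A, B, MAT_INITIAL_MATRIX, &rowb, &colb, &B_seq));
  // ... the values of B change, with an unchanged nonzero pattern ...
  PetscCall(MatGetBrowsOfAcols(A, B, MAT_REUSE_MATRIX, &rowb, &colb, &B_seq));
  PetscCall(ISDestroy(&rowb));
  PetscCall(ISDestroy(&colb));
  PetscCall(MatDestroy(&B_seq));
.ve
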
5701 .seealso: `Mat`, `MATMPIAIJ`, `IS`, `MatReuse`
5702 @*/
5703 PetscErrorCode MatGetBrowsOfAcols(Mat A, Mat B, MatReuse scall, IS *rowb, IS *colb, Mat *B_seq)
5704 {
5705   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
5706   PetscInt   *idx, i, start, ncols, nzA, nzB, *cmap, imark;
5707   IS          isrowb, iscolb;
5708   Mat        *bseq = NULL;
5709 
5710   PetscFunctionBegin;
5711   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5712              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5713   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAcols, A, B, 0, 0));
5714 
5715   if (scall == MAT_INITIAL_MATRIX) {
5716     start = A->cmap->rstart;
5717     cmap  = a->garray;
5718     nzA   = a->A->cmap->n;
5719     nzB   = a->B->cmap->n;
5720     PetscCall(PetscMalloc1(nzA + nzB, &idx));
5721     ncols = 0;
5722     for (i = 0; i < nzB; i++) { /* row < local row index */
5723       if (cmap[i] < start) idx[ncols++] = cmap[i];
5724       else break;
5725     }
5726     imark = i;
5727     for (i = 0; i < nzA; i++) idx[ncols++] = start + i;   /* local rows */
5728     for (i = imark; i < nzB; i++) idx[ncols++] = cmap[i]; /* row > local row index */
5729     PetscCall(ISCreateGeneral(PETSC_COMM_SELF, ncols, idx, PETSC_OWN_POINTER, &isrowb));
5730     PetscCall(ISCreateStride(PETSC_COMM_SELF, B->cmap->N, 0, 1, &iscolb));
5731   } else {
5732     PetscCheck(rowb && colb, PETSC_COMM_SELF, PETSC_ERR_SUP, "IS rowb and colb must be provided for MAT_REUSE_MATRIX");
5733     isrowb = *rowb;
5734     iscolb = *colb;
5735     PetscCall(PetscMalloc1(1, &bseq));
5736     bseq[0] = *B_seq;
5737   }
5738   PetscCall(MatCreateSubMatrices(B, 1, &isrowb, &iscolb, scall, &bseq));
5739   *B_seq = bseq[0];
5740   PetscCall(PetscFree(bseq));
5741   if (!rowb) {
5742     PetscCall(ISDestroy(&isrowb));
5743   } else {
5744     *rowb = isrowb;
5745   }
5746   if (!colb) {
5747     PetscCall(ISDestroy(&iscolb));
5748   } else {
5749     *colb = iscolb;
5750   }
5751   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAcols, A, B, 0, 0));
5752   PetscFunctionReturn(PETSC_SUCCESS);
5753 }
5754 
5755 /*
5756     MatGetBrowsOfAoCols_MPIAIJ - Creates a `MATSEQAIJ` matrix by taking the rows of B that correspond to the nonzero columns
5757     of the OFF-DIAGONAL portion of local A
5758 
5759     Collective
5760 
5761    Input Parameters:
5762 +    A,B - the matrices in `MATMPIAIJ` format
5763 -    scall - either `MAT_INITIAL_MATRIX` or `MAT_REUSE_MATRIX`
5764 
5765    Output Parameters:
5766 +    startsj_s - starting point in B's sending j-arrays, saved for MAT_REUSE (or NULL)
5767 .    startsj_r - starting point in B's receiving j-arrays, saved for MAT_REUSE (or NULL)
5768 .    bufa_ptr - array for sending matrix values, saved for MAT_REUSE (or NULL)
5769 -    B_oth - the sequential matrix generated with size aBn=a->B->cmap->n by B->cmap->N
5770 
5771     Developer Note:
5772     This directly accesses information inside the VecScatter associated with the matrix-vector product
5773      for this matrix. This is not desirable.
5774 
5775     Level: developer
5776 
5777 */
5778 
5779 PetscErrorCode MatGetBrowsOfAoCols_MPIAIJ(Mat A, Mat B, MatReuse scall, PetscInt **startsj_s, PetscInt **startsj_r, MatScalar **bufa_ptr, Mat *B_oth)
5780 {
5781   Mat_MPIAIJ        *a = (Mat_MPIAIJ *)A->data;
5782   VecScatter         ctx;
5783   MPI_Comm           comm;
5784   const PetscMPIInt *rprocs, *sprocs;
5785   PetscMPIInt        nrecvs, nsends;
5786   const PetscInt    *srow, *rstarts, *sstarts;
5787   PetscInt          *rowlen, *bufj, *bufJ, ncols = 0, aBn = a->B->cmap->n, row, *b_othi, *b_othj, *rvalues = NULL, *svalues = NULL, *cols, sbs, rbs;
5788   PetscInt           i, j, k = 0, l, ll, nrows, *rstartsj = NULL, *sstartsj, len;
5789   PetscScalar       *b_otha, *bufa, *bufA, *vals = NULL;
5790   MPI_Request       *reqs = NULL, *rwaits = NULL, *swaits = NULL;
5791   PetscMPIInt        size, tag, rank, nreqs;
5792 
5793   PetscFunctionBegin;
5794   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
5795   PetscCallMPI(MPI_Comm_size(comm, &size));
5796 
5797   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
5798              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
5799   PetscCall(PetscLogEventBegin(MAT_GetBrowsOfAocols, A, B, 0, 0));
5800   PetscCallMPI(MPI_Comm_rank(comm, &rank));
5801 
5802   if (size == 1) {
5803     startsj_s = NULL;
5804     bufa_ptr  = NULL;
5805     *B_oth    = NULL;
5806     PetscFunctionReturn(PETSC_SUCCESS);
5807   }
5808 
5809   ctx = a->Mvctx;
5810   tag = ((PetscObject)ctx)->tag;
5811 
5812   PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &srow, &sprocs, &sbs));
5813   /* rprocs[] must be ordered so that indices received from them are ordered in rvalues[], which is key to algorithms used in this subroutine */
5814   PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL /*indices not needed*/, &rprocs, &rbs));
5815   PetscCall(PetscMPIIntCast(nsends + nrecvs, &nreqs));
5816   PetscCall(PetscMalloc1(nreqs, &reqs));
5817   rwaits = reqs;
5818   swaits = PetscSafePointerPlusOffset(reqs, nrecvs);
5819 
5820   if (!startsj_s || !bufa_ptr) scall = MAT_INITIAL_MATRIX;
5821   if (scall == MAT_INITIAL_MATRIX) {
5822     /* i-array */
5823     /*  post receives */
5824     if (nrecvs) PetscCall(PetscMalloc1(rbs * (rstarts[nrecvs] - rstarts[0]), &rvalues)); /* rstarts can be NULL when nrecvs=0 */
5825     for (i = 0; i < nrecvs; i++) {
5826       rowlen = rvalues + rstarts[i] * rbs;
5827       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of indices to be received */
5828       PetscCallMPI(MPIU_Irecv(rowlen, nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5829     }
5830 
5831     /* pack the outgoing message */
5832     PetscCall(PetscMalloc2(nsends + 1, &sstartsj, nrecvs + 1, &rstartsj));
5833 
5834     sstartsj[0] = 0;
5835     rstartsj[0] = 0;
5836     len         = 0; /* total length of j or a array to be sent */
5837     if (nsends) {
5838       k = sstarts[0]; /* ATTENTION: sstarts[0] and rstarts[0] are not necessarily zero */
5839       PetscCall(PetscMalloc1(sbs * (sstarts[nsends] - sstarts[0]), &svalues));
5840     }
5841     for (i = 0; i < nsends; i++) {
5842       rowlen = svalues + (sstarts[i] - sstarts[0]) * sbs;
5843       nrows  = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5844       for (j = 0; j < nrows; j++) {
5845         row = srow[k] + B->rmap->range[rank]; /* global row idx */
5846         for (l = 0; l < sbs; l++) {
5847           PetscCall(MatGetRow_MPIAIJ(B, row + l, &ncols, NULL, NULL)); /* rowlength */
5848 
5849           rowlen[j * sbs + l] = ncols;
5850 
5851           len += ncols;
5852           PetscCall(MatRestoreRow_MPIAIJ(B, row + l, &ncols, NULL, NULL));
5853         }
5854         k++;
5855       }
5856       PetscCallMPI(MPIU_Isend(rowlen, nrows * sbs, MPIU_INT, sprocs[i], tag, comm, swaits + i));
5857 
5858       sstartsj[i + 1] = len; /* starting point of (i+1)-th outgoing msg in bufj and bufa */
5859     }
5860     /* recvs and sends of i-array are completed */
5861     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5862     PetscCall(PetscFree(svalues));
5863 
5864     /* allocate buffers for sending j and a arrays */
5865     PetscCall(PetscMalloc1(len + 1, &bufj));
5866     PetscCall(PetscMalloc1(len + 1, &bufa));
5867 
5868     /* create i-array of B_oth */
5869     PetscCall(PetscMalloc1(aBn + 2, &b_othi));
5870 
5871     b_othi[0] = 0;
5872     len       = 0; /* total length of j or a array to be received */
5873     k         = 0;
5874     for (i = 0; i < nrecvs; i++) {
5875       rowlen = rvalues + (rstarts[i] - rstarts[0]) * rbs;
5876       nrows  = (rstarts[i + 1] - rstarts[i]) * rbs; /* num of rows to be received */
5877       for (j = 0; j < nrows; j++) {
5878         b_othi[k + 1] = b_othi[k] + rowlen[j];
5879         PetscCall(PetscIntSumError(rowlen[j], len, &len));
5880         k++;
5881       }
5882       rstartsj[i + 1] = len; /* starting point of (i+1)-th incoming msg in bufj and bufa */
5883     }
5884     PetscCall(PetscFree(rvalues));
5885 
5886     /* allocate space for j and a arrays of B_oth */
5887     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_othj));
5888     PetscCall(PetscMalloc1(b_othi[aBn] + 1, &b_otha));
5889 
5890     /* j-array */
5891     /*  post receives of j-array */
5892     for (i = 0; i < nrecvs; i++) {
5893       nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5894       PetscCallMPI(MPIU_Irecv(b_othj + rstartsj[i], nrows, MPIU_INT, rprocs[i], tag, comm, rwaits + i));
5895     }
5896 
5897     /* pack the outgoing message j-array */
5898     if (nsends) k = sstarts[0];
5899     for (i = 0; i < nsends; i++) {
5900       nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5901       bufJ  = bufj + sstartsj[i];
5902       for (j = 0; j < nrows; j++) {
5903         row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5904         for (ll = 0; ll < sbs; ll++) {
5905           PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5906           for (l = 0; l < ncols; l++) *bufJ++ = cols[l];
5907           PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, &cols, NULL));
5908         }
5909       }
5910       PetscCallMPI(MPIU_Isend(bufj + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_INT, sprocs[i], tag, comm, swaits + i));
5911     }
5912 
5913     /* recvs and sends of j-array are completed */
5914     if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5915   } else if (scall == MAT_REUSE_MATRIX) {
5916     sstartsj = *startsj_s;
5917     rstartsj = *startsj_r;
5918     bufa     = *bufa_ptr;
5919     PetscCall(MatSeqAIJGetArrayWrite(*B_oth, &b_otha));
5920   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix P does not possess an object container");
5921 
5922   /* a-array */
5923   /*  post receives of a-array */
5924   for (i = 0; i < nrecvs; i++) {
5925     nrows = rstartsj[i + 1] - rstartsj[i]; /* length of the msg received */
5926     PetscCallMPI(MPIU_Irecv(b_otha + rstartsj[i], nrows, MPIU_SCALAR, rprocs[i], tag, comm, rwaits + i));
5927   }
5928 
5929   /* pack the outgoing message a-array */
5930   if (nsends) k = sstarts[0];
5931   for (i = 0; i < nsends; i++) {
5932     nrows = sstarts[i + 1] - sstarts[i]; /* num of block rows */
5933     bufA  = bufa + sstartsj[i];
5934     for (j = 0; j < nrows; j++) {
5935       row = srow[k++] + B->rmap->range[rank]; /* global row idx */
5936       for (ll = 0; ll < sbs; ll++) {
5937         PetscCall(MatGetRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5938         for (l = 0; l < ncols; l++) *bufA++ = vals[l];
5939         PetscCall(MatRestoreRow_MPIAIJ(B, row + ll, &ncols, NULL, &vals));
5940       }
5941     }
5942     PetscCallMPI(MPIU_Isend(bufa + sstartsj[i], sstartsj[i + 1] - sstartsj[i], MPIU_SCALAR, sprocs[i], tag, comm, swaits + i));
5943   }
5944   /* recvs and sends of a-array are completed */
5945   if (nreqs) PetscCallMPI(MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE));
5946   PetscCall(PetscFree(reqs));
5947 
5948   if (scall == MAT_INITIAL_MATRIX) {
5949     Mat_SeqAIJ *b_oth;
5950 
5951     /* put together the new matrix */
5952     PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, aBn, B->cmap->N, b_othi, b_othj, b_otha, B_oth));
5953 
5954     /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */
5955     /* Since these are PETSc arrays, change flags to free them as necessary. */
5956     b_oth          = (Mat_SeqAIJ *)(*B_oth)->data;
5957     b_oth->free_a  = PETSC_TRUE;
5958     b_oth->free_ij = PETSC_TRUE;
5959     b_oth->nonew   = 0;
5960 
5961     PetscCall(PetscFree(bufj));
5962     if (!startsj_s || !bufa_ptr) {
5963       PetscCall(PetscFree2(sstartsj, rstartsj));
5964       PetscCall(PetscFree(bufa_ptr));
5965     } else {
5966       *startsj_s = sstartsj;
5967       *startsj_r = rstartsj;
5968       *bufa_ptr  = bufa;
5969     }
5970   } else if (scall == MAT_REUSE_MATRIX) {
5971     PetscCall(MatSeqAIJRestoreArrayWrite(*B_oth, &b_otha));
5972   }
5973 
5974   PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE, &nsends, &sstarts, &srow, &sprocs, &sbs));
5975   PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE, &nrecvs, &rstarts, NULL, &rprocs, &rbs));
5976   PetscCall(PetscLogEventEnd(MAT_GetBrowsOfAocols, A, B, 0, 0));
5977   PetscFunctionReturn(PETSC_SUCCESS);
5978 }
5979 
5980 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCRL(Mat, MatType, MatReuse, Mat *);
5981 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJPERM(Mat, MatType, MatReuse, Mat *);
5982 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJSELL(Mat, MatType, MatReuse, Mat *);
5983 #if defined(PETSC_HAVE_MKL_SPARSE)
5984 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJMKL(Mat, MatType, MatReuse, Mat *);
5985 #endif
5986 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIBAIJ(Mat, MatType, MatReuse, Mat *);
5987 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISBAIJ(Mat, MatType, MatReuse, Mat *);
5988 #if defined(PETSC_HAVE_ELEMENTAL)
5989 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_Elemental(Mat, MatType, MatReuse, Mat *);
5990 #endif
5991 #if defined(PETSC_HAVE_SCALAPACK)
5992 PETSC_INTERN PetscErrorCode MatConvert_AIJ_ScaLAPACK(Mat, MatType, MatReuse, Mat *);
5993 #endif
5994 #if defined(PETSC_HAVE_HYPRE)
5995 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
5996 #endif
5997 #if defined(PETSC_HAVE_CUDA)
5998 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
5999 #endif
6000 #if defined(PETSC_HAVE_HIP)
6001 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJHIPSPARSE(Mat, MatType, MatReuse, Mat *);
6002 #endif
6003 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6004 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat, MatType, MatReuse, Mat *);
6005 #endif
6006 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPISELL(Mat, MatType, MatReuse, Mat *);
6007 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
6008 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_IS_XAIJ(Mat);
6009 
6010 /*
6011     Computes (B'*A')' since computing B*A directly is untenable
6012 
6013                n                       p                          p
6014         [             ]       [             ]         [                 ]
6015       m [      A      ]  *  n [       B     ]   =   m [         C       ]
6016         [             ]       [             ]         [                 ]
6017 
6018 */
6019 static PetscErrorCode MatMatMultNumeric_MPIDense_MPIAIJ(Mat A, Mat B, Mat C)
6020 {
6021   Mat At, Bt, Ct;
6022 
6023   PetscFunctionBegin;
6024   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
6025   PetscCall(MatTranspose(B, MAT_INITIAL_MATRIX, &Bt));
6026   PetscCall(MatMatMult(Bt, At, MAT_INITIAL_MATRIX, PETSC_CURRENT, &Ct));
6027   PetscCall(MatDestroy(&At));
6028   PetscCall(MatDestroy(&Bt));
6029   PetscCall(MatTransposeSetPrecursor(Ct, C));
6030   PetscCall(MatTranspose(Ct, MAT_REUSE_MATRIX, &C));
6031   PetscCall(MatDestroy(&Ct));
6032   PetscFunctionReturn(PETSC_SUCCESS);
6033 }
6034 
6035 static PetscErrorCode MatMatMultSymbolic_MPIDense_MPIAIJ(Mat A, Mat B, PetscReal fill, Mat C)
6036 {
6037   PetscBool cisdense;
6038 
6039   PetscFunctionBegin;
6040   PetscCheck(A->cmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "A->cmap->n %" PetscInt_FMT " != B->rmap->n %" PetscInt_FMT, A->cmap->n, B->rmap->n);
6041   PetscCall(MatSetSizes(C, A->rmap->n, B->cmap->n, A->rmap->N, B->cmap->N));
6042   PetscCall(MatSetBlockSizesFromMats(C, A, B));
6043   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &cisdense, MATMPIDENSE, MATMPIDENSECUDA, MATMPIDENSEHIP, ""));
6044   if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
6045   PetscCall(MatSetUp(C));
6046 
6047   C->ops->matmultnumeric = MatMatMultNumeric_MPIDense_MPIAIJ;
6048   PetscFunctionReturn(PETSC_SUCCESS);
6049 }
6050 
6051 static PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ_AB(Mat C)
6052 {
6053   Mat_Product *product = C->product;
6054   Mat          A = product->A, B = product->B;
6055 
6056   PetscFunctionBegin;
6057   PetscCheck(A->cmap->rstart == B->rmap->rstart && A->cmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")",
6058              A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend);
6059   C->ops->matmultsymbolic = MatMatMultSymbolic_MPIDense_MPIAIJ;
6060   C->ops->productsymbolic = MatProductSymbolic_AB;
6061   PetscFunctionReturn(PETSC_SUCCESS);
6062 }
6063 
6064 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIDense_MPIAIJ(Mat C)
6065 {
6066   Mat_Product *product = C->product;
6067 
6068   PetscFunctionBegin;
6069   if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_MPIDense_MPIAIJ_AB(C));
6070   PetscFunctionReturn(PETSC_SUCCESS);
6071 }
6072 
6073 /*
6074    Merge two sets of sorted nonzeros and return a CSR for the merged (sequential) matrix
6075 
6076   Input Parameters:
6077 
6078     j1,rowBegin1,rowEnd1,jmap1: describe the first set of nonzeros (Set1)
6079     j2,rowBegin2,rowEnd2,jmap2: describe the second set of nonzeros (Set2)
6080 
6081     mat: both sets' nonzeros are on m rows, where m is the number of local rows of the matrix mat
6082 
6083     For Set1, j1[] contains column indices of the nonzeros.
6084     For the k-th row (0<=k<m), [rowBegin1[k],rowEnd1[k]) index into j1[] and point to the begin/end nonzero in row k
6085     respectively (note rowEnd1[k] is not necessarily equal to rowBegin1[k+1]). Indices in this range of j1[] are sorted,
6086     but might have repeats. jmap1[t+1] - jmap1[t] is the number of repeats for the t-th unique nonzero in Set1.
6087 
6088     Similar for Set2.
6089 
6090     This routine merges the two sets of nonzeros row by row and removes repeats.
6091 
6092   Output Parameters: (memory is allocated by the caller)
6093 
6094     i[],j[]: the CSR of the merged matrix, which has m rows.
6095     imap1[]: the k-th unique nonzero in Set1 (k=0,1,...) corresponds to imap1[k]-th unique nonzero in the merged matrix.
6096     imap2[]: similar to imap1[], but for Set2.
6097     Note we order nonzeros row-by-row and from left to right.
6098 */
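
/*
  A small worked example of the merge on one row, with hypothetical data:

    Set1: j1 = {2, 2, 5}, jmap1 = {0, 2, 3}   (unique cols 2 and 5; col 2 repeats twice)
    Set2: j2 = {3, 5, 5}, jmap2 = {0, 1, 3}   (unique cols 3 and 5; col 5 repeats twice)

  The merged row is j = {2, 3, 5} (hence i = {0, 3}), with
    imap1 = {0, 2}   (Set1's unique nonzeros are the 0th and 2nd nonzeros of the merged matrix)
    imap2 = {1, 2}   (Set2's unique nonzeros are the 1st and 2nd nonzeros of the merged matrix)
*/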
6099 static PetscErrorCode MatMergeEntries_Internal(Mat mat, const PetscInt j1[], const PetscInt j2[], const PetscCount rowBegin1[], const PetscCount rowEnd1[], const PetscCount rowBegin2[], const PetscCount rowEnd2[], const PetscCount jmap1[], const PetscCount jmap2[], PetscCount imap1[], PetscCount imap2[], PetscInt i[], PetscInt j[])
6100 {
6101   PetscInt   r, m; /* Row index of mat */
6102   PetscCount t, t1, t2, b1, e1, b2, e2;
6103 
6104   PetscFunctionBegin;
6105   PetscCall(MatGetLocalSize(mat, &m, NULL));
6106   t1 = t2 = t = 0; /* Count of unique nonzeros in Set1, Set2 and the merged matrix, respectively */
6107   i[0]        = 0;
6108   for (r = 0; r < m; r++) { /* Do row by row merging */
6109     b1 = rowBegin1[r];
6110     e1 = rowEnd1[r];
6111     b2 = rowBegin2[r];
6112     e2 = rowEnd2[r];
6113     while (b1 < e1 && b2 < e2) {
6114       if (j1[b1] == j2[b2]) { /* Same column index and hence same nonzero */
6115         j[t]      = j1[b1];
6116         imap1[t1] = t;
6117         imap2[t2] = t;
6118         b1 += jmap1[t1 + 1] - jmap1[t1]; /* Jump to next unique local nonzero */
6119         b2 += jmap2[t2 + 1] - jmap2[t2]; /* Jump to next unique remote nonzero */
6120         t1++;
6121         t2++;
6122         t++;
6123       } else if (j1[b1] < j2[b2]) {
6124         j[t]      = j1[b1];
6125         imap1[t1] = t;
6126         b1 += jmap1[t1 + 1] - jmap1[t1];
6127         t1++;
6128         t++;
6129       } else {
6130         j[t]      = j2[b2];
6131         imap2[t2] = t;
6132         b2 += jmap2[t2 + 1] - jmap2[t2];
6133         t2++;
6134         t++;
6135       }
6136     }
6137     /* Merge the remaining in either j1[] or j2[] */
6138     while (b1 < e1) {
6139       j[t]      = j1[b1];
6140       imap1[t1] = t;
6141       b1 += jmap1[t1 + 1] - jmap1[t1];
6142       t1++;
6143       t++;
6144     }
6145     while (b2 < e2) {
6146       j[t]      = j2[b2];
6147       imap2[t2] = t;
6148       b2 += jmap2[t2 + 1] - jmap2[t2];
6149       t2++;
6150       t++;
6151     }
6152     PetscCall(PetscIntCast(t, i + r + 1));
6153   }
6154   PetscFunctionReturn(PETSC_SUCCESS);
6155 }
6156 
6157 /*
6158   Split nonzeros in a block of local rows into two subsets: those in the diagonal block and those in the off-diagonal block
6159 
6160   Input Parameters:
6161     mat: an MPI matrix that provides row and column layout information for splitting. Let's say its number of local rows is m.
6162     n,i[],j[],perm[]: there are n input entries, belonging to m rows. Row/col indices of the entries are stored in i[] and j[]
6163       respectively, along with a permutation array perm[]. Length of the i[],j[],perm[] arrays is n.
6164 
6165       i[] is already sorted, but within a row, j[] is not sorted and might have repeats.
6166       i[] might contain negative indices at the beginning, which means the corresponding entries should be ignored in the splitting.
6167 
6168   Output Parameters:
6169     j[],perm[]: the routine needs to sort j[] within each row along with perm[].
6170     rowBegin[],rowMid[],rowEnd[]: of length m, and the memory is preallocated and zeroed by the caller.
6171       They contain indices pointing to j[]. For 0<=r<m, [rowBegin[r],rowMid[r]) point to begin/end entries of row r of the diagonal block,
6172       and [rowMid[r],rowEnd[r]) point to begin/end entries of row r of the off-diagonal block.
6173 
6174     Aperm[],Ajmap[],Atot,Annz: Arrays are allocated by this routine.
6175       Atot: number of entries belonging to the diagonal block.
6176       Annz: number of unique nonzeros belonging to the diagonal block.
6177       Aperm[Atot] stores values from perm[] for entries belonging to the diagonal block. Length of Aperm[] is Atot, though it may also count
6178         repeats (i.e., same 'i,j' pair).
6179       Ajmap[Annz+1] stores the number of repeats of each unique entry belonging to the diagonal block. More precisely, Ajmap[t+1] - Ajmap[t]
6180         is the number of repeats for the t-th unique entry in the diagonal block. Ajmap[0] is always 0.
6181 
6185     Bperm[], Bjmap[], Btot, Bnnz are similar but for the off-diagonal block.
6186 
6187     Aperm[],Bperm[],Ajmap[] and Bjmap[] are allocated separately by this routine with PetscMalloc1().
6188 */
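
/*
  A small worked example with hypothetical data: suppose the local column range is [10, 20) and one
  local row holds the entries j = {25, 12, 12, 3} with perm = {0, 1, 2, 3}. The two diagonal entries
  (col 12) are shifted negative and hence sort to the front, giving j = {12, 12, 3, 25} and
  perm = {1, 2, 3, 0}, with rowBegin = 0, rowMid = 2, rowEnd = 4 for this row. The routine then outputs
    Atot = 2, Annz = 1, Aperm = {1, 2}, Ajmap = {0, 2}    (col 12 repeats twice in the diagonal block)
    Btot = 2, Bnnz = 2, Bperm = {3, 0}, Bjmap = {0, 1, 2} (cols 3 and 25 appear once each)
*/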
6189 static PetscErrorCode MatSplitEntries_Internal(Mat mat, PetscCount n, const PetscInt i[], PetscInt j[], PetscCount perm[], PetscCount rowBegin[], PetscCount rowMid[], PetscCount rowEnd[], PetscCount *Atot_, PetscCount **Aperm_, PetscCount *Annz_, PetscCount **Ajmap_, PetscCount *Btot_, PetscCount **Bperm_, PetscCount *Bnnz_, PetscCount **Bjmap_)
6190 {
6191   PetscInt    cstart, cend, rstart, rend, row, col;
6192   PetscCount  Atot = 0, Btot = 0; /* Total number of nonzeros in the diagonal and off-diagonal blocks */
6193   PetscCount  Annz = 0, Bnnz = 0; /* Number of unique nonzeros in the diagonal and off-diagonal blocks */
6194   PetscCount  k, m, p, q, r, s, mid;
6195   PetscCount *Aperm, *Bperm, *Ajmap, *Bjmap;
6196 
6197   PetscFunctionBegin;
6198   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6199   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6200   m = rend - rstart;
6201 
6202   /* Skip negative rows */
6203   for (k = 0; k < n; k++)
6204     if (i[k] >= 0) break;
6205 
6206   /* Process [k,n): sort and partition each local row into diag and offdiag portions,
6207      fill rowBegin[], rowMid[], rowEnd[], and count Atot, Btot, Annz, Bnnz.
6208   */
6209   while (k < n) {
6210     row = i[k];
6211     /* Entries in [k,s) are in one row. Shift diagonal block col indices so that diag is ahead of offdiag after sorting the row */
6212     for (s = k; s < n; s++)
6213       if (i[s] != row) break;
6214 
6215     /* Shift diag columns to range of [-PETSC_INT_MAX, -1] */
6216     for (p = k; p < s; p++) {
6217       if (j[p] >= cstart && j[p] < cend) j[p] -= PETSC_INT_MAX;
6218     }
6219     PetscCall(PetscSortIntWithCountArray(s - k, j + k, perm + k));
6220     PetscCall(PetscSortedIntUpperBound(j, k, s, -1, &mid)); /* Separate [k,s) into [k,mid) for diag and [mid,s) for offdiag */
6221     rowBegin[row - rstart] = k;
6222     rowMid[row - rstart]   = mid;
6223     rowEnd[row - rstart]   = s;
6224     PetscCheck(k == s || j[s - 1] < mat->cmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index %" PetscInt_FMT " is >= matrix column size %" PetscInt_FMT, j[s - 1], mat->cmap->N);
6225 
6226     /* Count nonzeros of this diag/offdiag row, which might have repeats */
6227     Atot += mid - k;
6228     Btot += s - mid;
6229 
6230     /* Count unique nonzeros of this diag row */
6231     for (p = k; p < mid;) {
6232       col = j[p];
6233       do {
6234         j[p] += PETSC_INT_MAX; /* Revert the modified diagonal indices */
6235         p++;
6236       } while (p < mid && j[p] == col);
6237       Annz++;
6238     }
6239 
6240     /* Count unique nonzeros of this offdiag row */
6241     for (p = mid; p < s;) {
6242       col = j[p];
6243       do {
6244         p++;
6245       } while (p < s && j[p] == col);
6246       Bnnz++;
6247     }
6248     k = s;
6249   }
6250 
6251   /* Allocation according to Atot, Btot, Annz, Bnnz */
6252   PetscCall(PetscMalloc1(Atot, &Aperm));
6253   PetscCall(PetscMalloc1(Btot, &Bperm));
6254   PetscCall(PetscMalloc1(Annz + 1, &Ajmap));
6255   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap));
6256 
6257   /* Re-scan indices and copy diag/offdiag permutation indices to Aperm, Bperm and also fill Ajmap and Bjmap */
6258   Ajmap[0] = Bjmap[0] = Atot = Btot = Annz = Bnnz = 0;
6259   for (r = 0; r < m; r++) {
6260     k   = rowBegin[r];
6261     mid = rowMid[r];
6262     s   = rowEnd[r];
6263     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Aperm, Atot), PetscSafePointerPlusOffset(perm, k), mid - k));
6264     PetscCall(PetscArraycpy(PetscSafePointerPlusOffset(Bperm, Btot), PetscSafePointerPlusOffset(perm, mid), s - mid));
6265     Atot += mid - k;
6266     Btot += s - mid;
6267 
6268     /* Scan column indices in this row and find out how many repeats each unique nonzero has */
6269     for (p = k; p < mid;) {
6270       col = j[p];
6271       q   = p;
6272       do {
6273         p++;
6274       } while (p < mid && j[p] == col);
6275       Ajmap[Annz + 1] = Ajmap[Annz] + (p - q);
6276       Annz++;
6277     }
6278 
6279     for (p = mid; p < s;) {
6280       col = j[p];
6281       q   = p;
6282       do {
6283         p++;
6284       } while (p < s && j[p] == col);
6285       Bjmap[Bnnz + 1] = Bjmap[Bnnz] + (p - q);
6286       Bnnz++;
6287     }
6288   }
6289   /* Output */
6290   *Aperm_ = Aperm;
6291   *Annz_  = Annz;
6292   *Atot_  = Atot;
6293   *Ajmap_ = Ajmap;
6294   *Bperm_ = Bperm;
6295   *Bnnz_  = Bnnz;
6296   *Btot_  = Btot;
6297   *Bjmap_ = Bjmap;
6298   PetscFunctionReturn(PETSC_SUCCESS);
6299 }
6300 
6301 /*
6302   Expand the jmap[] array to make a new one in view of nonzeros in the merged matrix
6303 
6304   Input Parameters:
6305     nnz1: number of unique nonzeros in a set that was used to produce imap[], jmap[]
6306     nnz:  number of unique nonzeros in the merged matrix
6307     imap[nnz1]: i-th nonzero in the set is the imap[i]-th nonzero in the merged matrix
6308     jmap[nnz1+1]: i-th nonzero in the set has jmap[i+1] - jmap[i] repeats in the set
6309 
6310   Output Parameter: (memory is allocated by the caller)
6311     jmap_new[nnz+1]: i-th nonzero in the merged matrix has jmap_new[i+1] - jmap_new[i] repeats in the set
6312 
6313   Example:
6314     nnz1 = 4
6315     nnz  = 6
6316     imap = [1,3,4,5]
6317     jmap = [0,3,5,6,7]
6318    then,
6319     jmap_new = [0,0,3,3,5,6,7]
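    (merged nonzeros 0 and 2 do not appear in imap[], i.e. the set contributed nothing to them, so
     jmap_new[] just repeats the previous prefix sum and their repeat counts are 0)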
6320 */
6321 static PetscErrorCode ExpandJmap_Internal(PetscCount nnz1, PetscCount nnz, const PetscCount imap[], const PetscCount jmap[], PetscCount jmap_new[])
6322 {
6323   PetscCount k, p;
6324 
6325   PetscFunctionBegin;
6326   jmap_new[0] = 0;
6327   p           = nnz;                /* p loops over jmap_new[] backwards */
6328   for (k = nnz1 - 1; k >= 0; k--) { /* k loops over imap[] */
6329     for (; p > imap[k]; p--) jmap_new[p] = jmap[k + 1];
6330   }
6331   for (; p >= 0; p--) jmap_new[p] = jmap[0];
6332   PetscFunctionReturn(PETSC_SUCCESS);
6333 }
6334 
6335 static PetscErrorCode MatCOOStructDestroy_MPIAIJ(void **data)
6336 {
6337   MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)*data;
6338 
6339   PetscFunctionBegin;
6340   PetscCall(PetscSFDestroy(&coo->sf));
6341   PetscCall(PetscFree(coo->Aperm1));
6342   PetscCall(PetscFree(coo->Bperm1));
6343   PetscCall(PetscFree(coo->Ajmap1));
6344   PetscCall(PetscFree(coo->Bjmap1));
6345   PetscCall(PetscFree(coo->Aimap2));
6346   PetscCall(PetscFree(coo->Bimap2));
6347   PetscCall(PetscFree(coo->Aperm2));
6348   PetscCall(PetscFree(coo->Bperm2));
6349   PetscCall(PetscFree(coo->Ajmap2));
6350   PetscCall(PetscFree(coo->Bjmap2));
6351   PetscCall(PetscFree(coo->Cperm1));
6352   PetscCall(PetscFree2(coo->sendbuf, coo->recvbuf));
6353   PetscCall(PetscFree(coo));
6354   PetscFunctionReturn(PETSC_SUCCESS);
6355 }
6356 
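/*
  MatSetPreallocationCOO_MPIAIJ - preprocess the COO (i,j) input so that MatSetValuesCOO_MPIAIJ() can
  repeatedly assemble values without any further communication setup.

  Outline of the steps below: sort the local entries by row while carrying a permutation; send entries
  that target remote rows to their owners using two PetscSFs (the first computes each sender's write
  offset, the second moves the data); split local and received entries into diagonal/off-diagonal parts;
  merge both sources into the CSR patterns of mpiaij->A and mpiaij->B; and stash the resulting
  perm/jmap/imap arrays in a MatCOOStruct_MPIAIJ composed with the matrix.

  Typical user-level usage (a sketch; MatSetPreallocationCOO()/MatSetValuesCOO() dispatch here for MATMPIAIJ):
    MatSetPreallocationCOO(A, ncoo, coo_i, coo_j);
    MatSetValuesCOO(A, v, ADD_VALUES);
*/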
6357 PetscErrorCode MatSetPreallocationCOO_MPIAIJ(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
6358 {
6359   MPI_Comm             comm;
6360   PetscMPIInt          rank, size;
6361   PetscInt             m, n, M, N, rstart, rend, cstart, cend; /* Sizes, indices of row/col, therefore with type PetscInt */
6362   PetscCount           k, p, q, rem;                           /* Loop variables over coo arrays */
6363   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6364   PetscContainer       container;
6365   MatCOOStruct_MPIAIJ *coo;
6366 
6367   PetscFunctionBegin;
6368   PetscCall(PetscFree(mpiaij->garray));
6369   PetscCall(VecDestroy(&mpiaij->lvec));
6370 #if defined(PETSC_USE_CTABLE)
6371   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
6372 #else
6373   PetscCall(PetscFree(mpiaij->colmap));
6374 #endif
6375   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
6376   mat->assembled     = PETSC_FALSE;
6377   mat->was_assembled = PETSC_FALSE;
6378 
6379   PetscCall(PetscObjectGetComm((PetscObject)mat, &comm));
6380   PetscCallMPI(MPI_Comm_size(comm, &size));
6381   PetscCallMPI(MPI_Comm_rank(comm, &rank));
6382   PetscCall(PetscLayoutSetUp(mat->rmap));
6383   PetscCall(PetscLayoutSetUp(mat->cmap));
6384   PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
6385   PetscCall(PetscLayoutGetRange(mat->cmap, &cstart, &cend));
6386   PetscCall(MatGetLocalSize(mat, &m, &n));
6387   PetscCall(MatGetSize(mat, &M, &N));
6388 
6389   /* Sort (i,j) by row along with a permutation array, so that the to-be-ignored */
6390   /* entries come first, then local rows, then remote rows.                     */
6391   PetscCount n1 = coo_n, *perm1;
6392   PetscInt  *i1 = coo_i, *j1 = coo_j;
6393 
6394   PetscCall(PetscMalloc1(n1, &perm1));
6395   for (k = 0; k < n1; k++) perm1[k] = k;
6396 
6397   /* Manipulate indices so that entries with negative row or col indices will have smallest
6398      row indices, local entries will have greater but negative row indices, and remote entries
6399      will have positive row indices.
6400   */
6401   for (k = 0; k < n1; k++) {
6402     if (i1[k] < 0 || j1[k] < 0) i1[k] = PETSC_INT_MIN;                /* e.g., -2^31, minimal to move them ahead */
6403     else if (i1[k] >= rstart && i1[k] < rend) i1[k] -= PETSC_INT_MAX; /* e.g., minus 2^31-1 to shift local rows to range of [-PETSC_INT_MAX, -1] */
6404     else {
6405       PetscCheck(!mat->nooffprocentries, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "MAT_NO_OFF_PROC_ENTRIES is set, but entries are being inserted into remote rows");
6406       if (mpiaij->donotstash) i1[k] = PETSC_INT_MIN; /* Ignore offproc entries as if they had negative indices */
6407     }
6408   }
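  /* E.g., hypothetically with [rstart,rend) = [10,20): an invalid entry gets i1[] = PETSC_INT_MIN,
     an entry in local row 12 gets i1[] = 12 - PETSC_INT_MAX (negative, but above PETSC_INT_MIN),
     and an entry in remote row 25 keeps i1[] = 25, so sorting by i1[] groups the entries as
     ignored, then local, then remote. */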
6409 
6410   /* Sort by row; after that, [0,k) have ignored entries, [k,rem) have local rows and [rem,n1) have remote rows */
6411   PetscCall(PetscSortIntWithIntCountArrayPair(n1, i1, j1, perm1));
6412 
6413   /* Advance k to the first entry we need to take care of */
6414   for (k = 0; k < n1; k++)
6415     if (i1[k] > PETSC_INT_MIN) break;
6416   PetscCount i1start = k;
6417 
6418   PetscCall(PetscSortedIntUpperBound(i1, k, n1, rend - 1 - PETSC_INT_MAX, &rem)); /* rem is upper bound of the last local row */
6419   for (; k < rem; k++) i1[k] += PETSC_INT_MAX;                                    /* Revert row indices of local rows */
6420 
6421   PetscCheck(n1 == 0 || i1[n1 - 1] < M, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "COO row index %" PetscInt_FMT " is >= the matrix row size %" PetscInt_FMT, i1[n1 - 1], M);
6422 
6423   /*           Send remote rows to their owner                                  */
6424   /* Find which rows should be sent to which remote ranks */
6425   PetscInt        nsend = 0; /* Number of MPI ranks to send data to */
6426   PetscMPIInt    *sendto;    /* [nsend], storing remote ranks */
6427   PetscInt       *nentries;  /* [nsend], storing number of entries sent to remote ranks; Assume PetscInt is big enough for this count, and error if not */
6428   const PetscInt *ranges;
6429   PetscInt        maxNsend = size >= 128 ? 128 : size; /* Assume max 128 neighbors; realloc when needed */
6430 
6431   PetscCall(PetscLayoutGetRanges(mat->rmap, &ranges));
6432   PetscCall(PetscMalloc2(maxNsend, &sendto, maxNsend, &nentries));
6433   for (k = rem; k < n1;) {
6434     PetscMPIInt owner;
6435     PetscInt    firstRow, lastRow;
6436 
6437     /* Locate a row range */
6438     firstRow = i1[k]; /* first row of this owner */
6439     PetscCall(PetscLayoutFindOwner(mat->rmap, firstRow, &owner));
6440     lastRow = ranges[owner + 1] - 1; /* last row of this owner */
6441 
6442     /* Find the first index 'p' in [k,n1) with i1[p] belonging to the next owner */
6443     PetscCall(PetscSortedIntUpperBound(i1, k, n1, lastRow, &p));
6444 
6445     /* All entries in [k,p) belong to this remote owner */
6446     if (nsend >= maxNsend) { /* Double the remote ranks arrays if not long enough */
6447       PetscMPIInt *sendto2;
6448       PetscInt    *nentries2;
6449       PetscInt     maxNsend2 = (maxNsend <= size / 2) ? maxNsend * 2 : size;
6450 
6451       PetscCall(PetscMalloc2(maxNsend2, &sendto2, maxNsend2, &nentries2));
6452       PetscCall(PetscArraycpy(sendto2, sendto, maxNsend));
6453       PetscCall(PetscArraycpy(nentries2, nentries, maxNsend));
6454       PetscCall(PetscFree2(sendto, nentries));
6455       sendto   = sendto2;
6456       nentries = nentries2;
6457       maxNsend = maxNsend2;
6458     }
6459     sendto[nsend] = owner;
6460     PetscCall(PetscIntCast(p - k, &nentries[nsend]));
6461     nsend++;
6462     k = p;
6463   }
6464 
6465   /* Build 1st SF to know offsets on remote to send data */
6466   PetscSF      sf1;
6467   PetscInt     nroots = 1, nroots2 = 0;
6468   PetscInt     nleaves = nsend, nleaves2 = 0;
6469   PetscInt    *offsets;
6470   PetscSFNode *iremote;
6471 
6472   PetscCall(PetscSFCreate(comm, &sf1));
6473   PetscCall(PetscMalloc1(nsend, &iremote));
6474   PetscCall(PetscMalloc1(nsend, &offsets));
6475   for (k = 0; k < nsend; k++) {
6476     iremote[k].rank  = sendto[k];
6477     iremote[k].index = 0;
6478     nleaves2 += nentries[k];
6479     PetscCheck(nleaves2 >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF leaves is too large for PetscInt");
6480   }
6481   PetscCall(PetscSFSetGraph(sf1, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
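  /* PetscSFFetchAndOp with MPI_SUM: each leaf (sender) fetches the root's current counter as its write
     offset and atomically adds its own entry count. E.g., hypothetically, if ranks 0 and 1 send 3 and 5
     entries to the same root, the root ends with nroots2 = 8 and the senders receive offsets {0,3} or
     {0,5}, depending on arrival order. */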
6482   PetscCall(PetscSFFetchAndOpWithMemTypeBegin(sf1, MPIU_INT, PETSC_MEMTYPE_HOST, &nroots2 /*rootdata*/, PETSC_MEMTYPE_HOST, nentries /*leafdata*/, PETSC_MEMTYPE_HOST, offsets /*leafupdate*/, MPI_SUM));
6483   PetscCall(PetscSFFetchAndOpEnd(sf1, MPIU_INT, &nroots2, nentries, offsets, MPI_SUM)); /* If nroots2 overflowed, the offsets[] check below would catch it */
6484   PetscCall(PetscSFDestroy(&sf1));
6485   PetscAssert(nleaves2 == n1 - rem, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nleaves2 %" PetscInt_FMT " != number of remote entries %" PetscCount_FMT, nleaves2, n1 - rem);
6486 
6487   /* Build 2nd SF to send remote COOs to their owner */
6488   PetscSF sf2;
6489   nroots  = nroots2;
6490   nleaves = nleaves2;
6491   PetscCall(PetscSFCreate(comm, &sf2));
6492   PetscCall(PetscSFSetFromOptions(sf2));
6493   PetscCall(PetscMalloc1(nleaves, &iremote));
6494   p = 0;
6495   for (k = 0; k < nsend; k++) {
6496     PetscCheck(offsets[k] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Number of SF roots is too large for PetscInt");
6497     for (q = 0; q < nentries[k]; q++, p++) {
6498       iremote[p].rank = sendto[k];
6499       PetscCall(PetscIntCast(offsets[k] + q, &iremote[p].index));
6500     }
6501   }
6502   PetscCall(PetscSFSetGraph(sf2, nroots, nleaves, NULL, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
6503 
6504   /* Send the remote COOs to their owner */
6505   PetscInt    n2 = nroots, *i2, *j2; /* Buffers for received COOs from other ranks, along with a permutation array */
6506   PetscCount *perm2;                 /* Though PetscInt is enough for remote entries, we use PetscCount here as we want to reuse MatSplitEntries_Internal() */
6507   PetscCall(PetscMalloc3(n2, &i2, n2, &j2, n2, &perm2));
6508   PetscAssert(rem == 0 || i1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6509   PetscAssert(rem == 0 || j1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6510   PetscInt *i1prem = PetscSafePointerPlusOffset(i1, rem);
6511   PetscInt *j1prem = PetscSafePointerPlusOffset(j1, rem);
6512   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, i1prem, PETSC_MEMTYPE_HOST, i2, MPI_REPLACE));
6513   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, i1prem, i2, MPI_REPLACE));
6514   PetscCall(PetscSFReduceWithMemTypeBegin(sf2, MPIU_INT, PETSC_MEMTYPE_HOST, j1prem, PETSC_MEMTYPE_HOST, j2, MPI_REPLACE));
6515   PetscCall(PetscSFReduceEnd(sf2, MPIU_INT, j1prem, j2, MPI_REPLACE));
6516 
6517   PetscCall(PetscFree(offsets));
6518   PetscCall(PetscFree2(sendto, nentries));
6519 
6520   /* Sort received COOs by row along with the permutation array     */
6521   for (k = 0; k < n2; k++) perm2[k] = k;
6522   PetscCall(PetscSortIntWithIntCountArrayPair(n2, i2, j2, perm2));
6523 
6524   /* sf2 only sends contiguous leafdata to contiguous rootdata. We record the permutation which will be used to fill leafdata */
6525   PetscCount *Cperm1;
6526   PetscAssert(rem == 0 || perm1 != NULL, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cannot add nonzero offset to null");
6527   PetscCount *perm1prem = PetscSafePointerPlusOffset(perm1, rem);
6528   PetscCall(PetscMalloc1(nleaves, &Cperm1));
6529   PetscCall(PetscArraycpy(Cperm1, perm1prem, nleaves));
6530 
6531   /* Support for HYPRE matrices, kind of a hack.
6532      Swap min column with diagonal so that diagonal values will go first */
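  /* E.g., hypothetically, if local row r has diag-block entries in global columns {cstart+r, c} with
     c = minj[r] < cstart+r, the two entries swap column labels, so after the later per-row sort the
     diagonal value sits in the first slot of the row, as hypre expects. */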
6533   PetscBool hypre;
6534   PetscCall(PetscStrcmp("_internal_COO_mat_for_hypre", ((PetscObject)mat)->name, &hypre));
6535   if (hypre) {
6536     PetscInt *minj;
6537     PetscBT   hasdiag;
6538 
6539     PetscCall(PetscBTCreate(m, &hasdiag));
6540     PetscCall(PetscMalloc1(m, &minj));
6541     for (k = 0; k < m; k++) minj[k] = PETSC_INT_MAX;
6542     for (k = i1start; k < rem; k++) {
6543       if (j1[k] < cstart || j1[k] >= cend) continue;
6544       const PetscInt rindex = i1[k] - rstart;
6545       if ((j1[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6546       minj[rindex] = PetscMin(minj[rindex], j1[k]);
6547     }
6548     for (k = 0; k < n2; k++) {
6549       if (j2[k] < cstart || j2[k] >= cend) continue;
6550       const PetscInt rindex = i2[k] - rstart;
6551       if ((j2[k] - cstart) == rindex) PetscCall(PetscBTSet(hasdiag, rindex));
6552       minj[rindex] = PetscMin(minj[rindex], j2[k]);
6553     }
6554     for (k = i1start; k < rem; k++) {
6555       const PetscInt rindex = i1[k] - rstart;
6556       if (j1[k] < cstart || j1[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6557       if (j1[k] == minj[rindex]) j1[k] = i1[k] + (cstart - rstart);
6558       else if ((j1[k] - cstart) == rindex) j1[k] = minj[rindex];
6559     }
6560     for (k = 0; k < n2; k++) {
6561       const PetscInt rindex = i2[k] - rstart;
6562       if (j2[k] < cstart || j2[k] >= cend || !PetscBTLookup(hasdiag, rindex)) continue;
6563       if (j2[k] == minj[rindex]) j2[k] = i2[k] + (cstart - rstart);
6564       else if ((j2[k] - cstart) == rindex) j2[k] = minj[rindex];
6565     }
6566     PetscCall(PetscBTDestroy(&hasdiag));
6567     PetscCall(PetscFree(minj));
6568   }
6569 
6570   /* Split local COOs and received COOs into diag/offdiag portions */
6571   PetscCount *rowBegin1, *rowMid1, *rowEnd1;
6572   PetscCount *Ajmap1, *Aperm1, *Bjmap1, *Bperm1;
6573   PetscCount  Annz1, Bnnz1, Atot1, Btot1;
6574   PetscCount *rowBegin2, *rowMid2, *rowEnd2;
6575   PetscCount *Ajmap2, *Aperm2, *Bjmap2, *Bperm2;
6576   PetscCount  Annz2, Bnnz2, Atot2, Btot2;
6577 
6578   PetscCall(PetscCalloc3(m, &rowBegin1, m, &rowMid1, m, &rowEnd1));
6579   PetscCall(PetscCalloc3(m, &rowBegin2, m, &rowMid2, m, &rowEnd2));
6580   PetscCall(MatSplitEntries_Internal(mat, rem, i1, j1, perm1, rowBegin1, rowMid1, rowEnd1, &Atot1, &Aperm1, &Annz1, &Ajmap1, &Btot1, &Bperm1, &Bnnz1, &Bjmap1));
6581   PetscCall(MatSplitEntries_Internal(mat, n2, i2, j2, perm2, rowBegin2, rowMid2, rowEnd2, &Atot2, &Aperm2, &Annz2, &Ajmap2, &Btot2, &Bperm2, &Bnnz2, &Bjmap2));
6582 
6583   /* Merge local COOs with received COOs: diag with diag, offdiag with offdiag */
6584   PetscInt *Ai, *Bi;
6585   PetscInt *Aj, *Bj;
6586 
6587   PetscCall(PetscMalloc1(m + 1, &Ai));
6588   PetscCall(PetscMalloc1(m + 1, &Bi));
6589   PetscCall(PetscMalloc1(Annz1 + Annz2, &Aj)); /* Since local and remote entries might have dups, we might allocate excess memory */
6590   PetscCall(PetscMalloc1(Bnnz1 + Bnnz2, &Bj));
6591 
6592   PetscCount *Aimap1, *Bimap1, *Aimap2, *Bimap2;
6593   PetscCall(PetscMalloc1(Annz1, &Aimap1));
6594   PetscCall(PetscMalloc1(Bnnz1, &Bimap1));
6595   PetscCall(PetscMalloc1(Annz2, &Aimap2));
6596   PetscCall(PetscMalloc1(Bnnz2, &Bimap2));
6597 
6598   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowBegin1, rowMid1, rowBegin2, rowMid2, Ajmap1, Ajmap2, Aimap1, Aimap2, Ai, Aj));
6599   PetscCall(MatMergeEntries_Internal(mat, j1, j2, rowMid1, rowEnd1, rowMid2, rowEnd2, Bjmap1, Bjmap2, Bimap1, Bimap2, Bi, Bj));
6600 
6601   /* Expand Ajmap1/Bjmap1 to make them based off nonzeros in A/B, since we     */
6602   /* expect most nonzeros in A/B to have local contributing entries            */
6603   PetscInt    Annz = Ai[m];
6604   PetscInt    Bnnz = Bi[m];
6605   PetscCount *Ajmap1_new, *Bjmap1_new;
6606 
6607   PetscCall(PetscMalloc1(Annz + 1, &Ajmap1_new));
6608   PetscCall(PetscMalloc1(Bnnz + 1, &Bjmap1_new));
6609 
6610   PetscCall(ExpandJmap_Internal(Annz1, Annz, Aimap1, Ajmap1, Ajmap1_new));
6611   PetscCall(ExpandJmap_Internal(Bnnz1, Bnnz, Bimap1, Bjmap1, Bjmap1_new));
6612 
6613   PetscCall(PetscFree(Aimap1));
6614   PetscCall(PetscFree(Ajmap1));
6615   PetscCall(PetscFree(Bimap1));
6616   PetscCall(PetscFree(Bjmap1));
6617   PetscCall(PetscFree3(rowBegin1, rowMid1, rowEnd1));
6618   PetscCall(PetscFree3(rowBegin2, rowMid2, rowEnd2));
6619   PetscCall(PetscFree(perm1));
6620   PetscCall(PetscFree3(i2, j2, perm2));
6621 
6622   Ajmap1 = Ajmap1_new;
6623   Bjmap1 = Bjmap1_new;
6624 
6625   /* Reallocate Aj, Bj once we know actual numbers of unique nonzeros in A and B */
6626   if (Annz < Annz1 + Annz2) {
6627     PetscInt *Aj_new;
6628     PetscCall(PetscMalloc1(Annz, &Aj_new));
6629     PetscCall(PetscArraycpy(Aj_new, Aj, Annz));
6630     PetscCall(PetscFree(Aj));
6631     Aj = Aj_new;
6632   }
6633 
6634   if (Bnnz < Bnnz1 + Bnnz2) {
6635     PetscInt *Bj_new;
6636     PetscCall(PetscMalloc1(Bnnz, &Bj_new));
6637     PetscCall(PetscArraycpy(Bj_new, Bj, Bnnz));
6638     PetscCall(PetscFree(Bj));
6639     Bj = Bj_new;
6640   }
6641 
6642   /* Create new submatrices for on-process and off-process coupling                  */
6643   PetscScalar     *Aa, *Ba;
6644   MatType          rtype;
6645   Mat_SeqAIJ      *a, *b;
6646   PetscObjectState state;
6647   PetscCall(PetscCalloc1(Annz, &Aa)); /* Zero-initialized values, so A and B start as zero matrices */
6648   PetscCall(PetscCalloc1(Bnnz, &Ba));
6649   /* make Aj[] local, i.e., based off the start column of the diagonal portion */
6650   if (cstart) {
6651     for (k = 0; k < Annz; k++) Aj[k] -= cstart;
6652   }
6653 
6654   PetscCall(MatGetRootType_Private(mat, &rtype));
6655 
6656   MatSeqXAIJGetOptions_Private(mpiaij->A);
6657   PetscCall(MatDestroy(&mpiaij->A));
6658   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, Ai, Aj, Aa, &mpiaij->A));
6659   PetscCall(MatSetBlockSizesFromMats(mpiaij->A, mat, mat));
6660   MatSeqXAIJRestoreOptions_Private(mpiaij->A);
6661 
6662   MatSeqXAIJGetOptions_Private(mpiaij->B);
6663   PetscCall(MatDestroy(&mpiaij->B));
6664   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, mat->cmap->N, Bi, Bj, Ba, &mpiaij->B));
6665   PetscCall(MatSetBlockSizesFromMats(mpiaij->B, mat, mat));
6666   MatSeqXAIJRestoreOptions_Private(mpiaij->B);
6667 
6668   PetscCall(MatSetUpMultiply_MPIAIJ(mat));
6669   mat->was_assembled = PETSC_TRUE; // was_assembled in effect means the Mvctx is built; doing so avoids redundant MatSetUpMultiply_MPIAIJ
6670   state              = mpiaij->A->nonzerostate + mpiaij->B->nonzerostate;
6671   PetscCallMPI(MPIU_Allreduce(&state, &mat->nonzerostate, 1, MPIU_INT64, MPI_SUM, PetscObjectComm((PetscObject)mat)));
6672 
6673   a          = (Mat_SeqAIJ *)mpiaij->A->data;
6674   b          = (Mat_SeqAIJ *)mpiaij->B->data;
6675   a->free_a  = PETSC_TRUE;
6676   a->free_ij = PETSC_TRUE;
6677   b->free_a  = PETSC_TRUE;
6678   b->free_ij = PETSC_TRUE;
6679   a->maxnz   = a->nz;
6680   b->maxnz   = b->nz;
6681 
6682   /* conversion must happen AFTER multiply setup */
6683   PetscCall(MatConvert(mpiaij->A, rtype, MAT_INPLACE_MATRIX, &mpiaij->A));
6684   PetscCall(MatConvert(mpiaij->B, rtype, MAT_INPLACE_MATRIX, &mpiaij->B));
6685   PetscCall(VecDestroy(&mpiaij->lvec));
6686   PetscCall(MatCreateVecs(mpiaij->B, &mpiaij->lvec, NULL));
6687 
6688   // Put the COO struct in a container and then attach that to the matrix
6689   PetscCall(PetscMalloc1(1, &coo));
6690   coo->n       = coo_n;
6691   coo->sf      = sf2;
6692   coo->sendlen = nleaves;
6693   coo->recvlen = nroots;
6694   coo->Annz    = Annz;
6695   coo->Bnnz    = Bnnz;
6696   coo->Annz2   = Annz2;
6697   coo->Bnnz2   = Bnnz2;
6698   coo->Atot1   = Atot1;
6699   coo->Atot2   = Atot2;
6700   coo->Btot1   = Btot1;
6701   coo->Btot2   = Btot2;
6702   coo->Ajmap1  = Ajmap1;
6703   coo->Aperm1  = Aperm1;
6704   coo->Bjmap1  = Bjmap1;
6705   coo->Bperm1  = Bperm1;
6706   coo->Aimap2  = Aimap2;
6707   coo->Ajmap2  = Ajmap2;
6708   coo->Aperm2  = Aperm2;
6709   coo->Bimap2  = Bimap2;
6710   coo->Bjmap2  = Bjmap2;
6711   coo->Bperm2  = Bperm2;
6712   coo->Cperm1  = Cperm1;
6713   // Allocate the send/recv buffers during preallocation. If not used, they have zero cost on the host
6714   PetscCall(PetscMalloc2(coo->sendlen, &coo->sendbuf, coo->recvlen, &coo->recvbuf));
6715   PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container));
6716   PetscCall(PetscContainerSetPointer(container, coo));
6717   PetscCall(PetscContainerSetCtxDestroy(container, MatCOOStructDestroy_MPIAIJ));
6718   PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject)container));
6719   PetscCall(PetscContainerDestroy(&container));
6720   PetscFunctionReturn(PETSC_SUCCESS);
6721 }
6722 
6723 static PetscErrorCode MatSetValuesCOO_MPIAIJ(Mat mat, const PetscScalar v[], InsertMode imode)
6724 {
6725   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
6726   Mat                  A = mpiaij->A, B = mpiaij->B;
6727   PetscScalar         *Aa, *Ba;
6728   PetscScalar         *sendbuf, *recvbuf;
6729   const PetscCount    *Ajmap1, *Ajmap2, *Aimap2;
6730   const PetscCount    *Bjmap1, *Bjmap2, *Bimap2;
6731   const PetscCount    *Aperm1, *Aperm2, *Bperm1, *Bperm2;
6732   const PetscCount    *Cperm1;
6733   PetscContainer       container;
6734   MatCOOStruct_MPIAIJ *coo;
6735 
6736   PetscFunctionBegin;
6737   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container));
6738   PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
6739   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
6740   sendbuf = coo->sendbuf;
6741   recvbuf = coo->recvbuf;
6742   Ajmap1  = coo->Ajmap1;
6743   Ajmap2  = coo->Ajmap2;
6744   Aimap2  = coo->Aimap2;
6745   Bjmap1  = coo->Bjmap1;
6746   Bjmap2  = coo->Bjmap2;
6747   Bimap2  = coo->Bimap2;
6748   Aperm1  = coo->Aperm1;
6749   Aperm2  = coo->Aperm2;
6750   Bperm1  = coo->Bperm1;
6751   Bperm2  = coo->Bperm2;
6752   Cperm1  = coo->Cperm1;
6753 
6754   PetscCall(MatSeqAIJGetArray(A, &Aa)); /* Might read and write matrix values */
6755   PetscCall(MatSeqAIJGetArray(B, &Ba));
6756 
6757   /* Pack entries to be sent to remote */
6758   for (PetscCount i = 0; i < coo->sendlen; i++) sendbuf[i] = v[Cperm1[i]];
6759 
6760   /* Send remote entries to their owner and overlap the communication with local computation */
6761   PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_HOST, sendbuf, PETSC_MEMTYPE_HOST, recvbuf, MPI_REPLACE));
6762   /* Add local entries to A and B */
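  /* E.g., hypothetically, Ajmap1 = {0,2,3} with coo->Annz = 2 means unique nonzero 0 accumulates
     v[Aperm1[0]] + v[Aperm1[1]] and unique nonzero 1 accumulates v[Aperm1[2]]. */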
6763   for (PetscCount i = 0; i < coo->Annz; i++) { /* All nonzeros in A are either zero'ed or added with a value (i.e., initialized) */
6764     PetscScalar sum = 0.0;                     /* Do partial summation first to improve numerical stability */
6765     for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += v[Aperm1[k]];
6766     Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
6767   }
6768   for (PetscCount i = 0; i < coo->Bnnz; i++) {
6769     PetscScalar sum = 0.0;
6770     for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += v[Bperm1[k]];
6771     Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
6772   }
6773   PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, sendbuf, recvbuf, MPI_REPLACE));
6774 
6775   /* Add received remote entries to A and B */
6776   for (PetscCount i = 0; i < coo->Annz2; i++) {
6777     for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += recvbuf[Aperm2[k]];
6778   }
6779   for (PetscCount i = 0; i < coo->Bnnz2; i++) {
6780     for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += recvbuf[Bperm2[k]];
6781   }
6782   PetscCall(MatSeqAIJRestoreArray(A, &Aa));
6783   PetscCall(MatSeqAIJRestoreArray(B, &Ba));
6784   PetscFunctionReturn(PETSC_SUCCESS);
6785 }
6786 
6787 /*MC
6788    MATMPIAIJ - MATMPIAIJ = "mpiaij" - A matrix type to be used for parallel sparse matrices.
6789 
6790    Options Database Keys:
6791 . -mat_type mpiaij - sets the matrix type to `MATMPIAIJ` during a call to `MatSetFromOptions()`
6792 
6793    Level: beginner
6794 
6795    Notes:
6796    `MatSetValues()` may be called for this matrix type with a `NULL` argument for the numerical values;
6797     in this case the values associated with the rows and columns one passes in are set to zero
6798     in the matrix.
6799 
6800     `MatSetOption`(A,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this case no
6801     space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored.
6802 
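   Example Usage:
   A minimal creation sketch (the sizes and preallocation figures are placeholders):
.vb
   Mat A;
   MatCreate(comm, &A);
   MatSetSizes(A, m, n, M, N);
   MatSetType(A, MATMPIAIJ);
   MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL); /* about 5 nonzeros per row in the diagonal block, 2 in the off-diagonal block */
   /* ... MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() ... */
.ve
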
6803 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJ`, `MATAIJ`, `MatCreateAIJ()`
6804 M*/
6805 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJ(Mat B)
6806 {
6807   Mat_MPIAIJ *b;
6808   PetscMPIInt size;
6809 
6810   PetscFunctionBegin;
6811   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
6812 
6813   PetscCall(PetscNew(&b));
6814   B->data       = (void *)b;
6815   B->ops[0]     = MatOps_Values;
6816   B->assembled  = PETSC_FALSE;
6817   B->insertmode = NOT_SET_VALUES;
6818   b->size       = size;
6819 
6820   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B), &b->rank));
6821 
6822   /* build cache for off array entries formed */
6823   PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)B), 1, &B->stash));
6824 
6825   b->donotstash  = PETSC_FALSE;
6826   b->colmap      = NULL;
6827   b->garray      = NULL;
6828   b->roworiented = PETSC_TRUE;
6829 
6830   /* stuff used for matrix vector multiply */
6831   b->lvec  = NULL;
6832   b->Mvctx = NULL;
6833 
6834   /* stuff for MatGetRow() */
6835   b->rowindices   = NULL;
6836   b->rowvalues    = NULL;
6837   b->getrowactive = PETSC_FALSE;
6838 
6839   /* flexible pointer used in CUSPARSE classes */
6840   b->spptr = NULL;
6841 
6842   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetUseScalableIncreaseOverlap_C", MatMPIAIJSetUseScalableIncreaseOverlap_MPIAIJ));
6843   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_MPIAIJ));
6844   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_MPIAIJ));
6845   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_MPIAIJ));
6846   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJ));
6847   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetPreallocation_C", MatResetPreallocation_MPIAIJ));
6848   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatResetHash_C", MatResetHash_MPIAIJ));
6849   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatMPIAIJSetPreallocationCSR_C", MatMPIAIJSetPreallocationCSR_MPIAIJ));
6850   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatDiagonalScaleLocal_C", MatDiagonalScaleLocal_MPIAIJ));
6851   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijperm_C", MatConvert_MPIAIJ_MPIAIJPERM));
6852   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijsell_C", MatConvert_MPIAIJ_MPIAIJSELL));
6853 #if defined(PETSC_HAVE_CUDA)
6854   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcusparse_C", MatConvert_MPIAIJ_MPIAIJCUSPARSE));
6855 #endif
6856 #if defined(PETSC_HAVE_HIP)
6857   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijhipsparse_C", MatConvert_MPIAIJ_MPIAIJHIPSPARSE));
6858 #endif
6859 #if defined(PETSC_HAVE_KOKKOS_KERNELS)
6860   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijkokkos_C", MatConvert_MPIAIJ_MPIAIJKokkos));
6861 #endif
6862 #if defined(PETSC_HAVE_MKL_SPARSE)
6863   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijmkl_C", MatConvert_MPIAIJ_MPIAIJMKL));
6864 #endif
6865   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpiaijcrl_C", MatConvert_MPIAIJ_MPIAIJCRL));
6866   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpibaij_C", MatConvert_MPIAIJ_MPIBAIJ));
6867   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisbaij_C", MatConvert_MPIAIJ_MPISBAIJ));
6868   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpidense_C", MatConvert_MPIAIJ_MPIDense));
6869 #if defined(PETSC_HAVE_ELEMENTAL)
6870   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_elemental_C", MatConvert_MPIAIJ_Elemental));
6871 #endif
6872 #if defined(PETSC_HAVE_SCALAPACK)
6873   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_scalapack_C", MatConvert_AIJ_ScaLAPACK));
6874 #endif
6875   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_is_C", MatConvert_XAIJ_IS));
6876   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_mpisell_C", MatConvert_MPIAIJ_MPISELL));
6877 #if defined(PETSC_HAVE_HYPRE)
6878   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_mpiaij_hypre_C", MatConvert_AIJ_HYPRE));
6879   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_transpose_mpiaij_mpiaij_C", MatProductSetFromOptions_Transpose_AIJ_AIJ));
6880 #endif
6881   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_is_mpiaij_C", MatProductSetFromOptions_IS_XAIJ));
6882   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatProductSetFromOptions_mpiaij_mpiaij_C", MatProductSetFromOptions_MPIAIJ));
6883   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJ));
6884   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJ));
6885   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATMPIAIJ));
6886   PetscFunctionReturn(PETSC_SUCCESS);
6887 }
6888 
6889 /*@
6890   MatCreateMPIAIJWithSplitArrays - creates a `MATMPIAIJ` matrix using arrays that contain the "diagonal"
6891   and "off-diagonal" part of the matrix in CSR format.
6892 
6893   Collective
6894 
6895   Input Parameters:
6896 + comm - MPI communicator
6897 . m    - number of local rows (Cannot be `PETSC_DECIDE`)
6898 . n    - This value should be the same as the local size used in creating the
6899          x vector for the matrix-vector product $y = Ax$ (or `PETSC_DECIDE` to have it
6900          calculated if `N` is given). For square matrices `n` is almost always `m`.
6901 . M    - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
6902 . N    - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
6903 . i    - row indices for "diagonal" portion of matrix; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row of the matrix
6904 . j    - column indices, which must be local, i.e., based off the start column of the diagonal portion
6905 . a    - matrix values
6906 . oi   - row indices for "off-diagonal" portion of matrix; that is oi[0] = 0, oi[row] = oi[row-1] + number of elements in that row of the matrix
6907 . oj   - column indices, which must be global, representing global columns in the `MATMPIAIJ` matrix
6908 - oa   - matrix values
6909 
6910   Output Parameter:
6911 . mat - the matrix
6912 
6913   Level: advanced
6914 
6915   Notes:
6916   The `i`, `j`, and `a` arrays ARE NOT copied by this routine into the internal format used by PETSc (even in Fortran). The user
6917   must free the arrays once the matrix has been destroyed and not before.
6918 
6919   The `i` and `j` indices are 0 based
6920 
6921   See `MatCreateAIJ()` for the definition of "diagonal" and "off-diagonal" portion of the matrix
6922 
6923   This sets local rows and cannot be used to set off-processor values.
6924 
6925   Use of this routine is discouraged because it is inflexible and cumbersome to use. It is extremely rare that a
6926   legacy application natively assembles into exactly this split format. The code to do so is nontrivial and does
6927   not easily support in-place reassembly. It is recommended to use `MatSetValues()` (or a variant thereof) because
6928   the resulting assembly is easier to implement, will work with any matrix format, and the user does not have to
6929   keep track of the underlying array. Use `MatSetOption`(A,`MAT_NO_OFF_PROC_ENTRIES`,`PETSC_TRUE`) to disable all
6930   communication if it is known that only local entries will be set.
6931 
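   Example Usage:
   A sketch for one rank owning 2 rows and 2 diagonal-block columns, with a single off-diagonal entry in
   a hypothetical global column gcol; remember the arrays must remain valid until the matrix is destroyed:
.vb
   PetscInt    i[]  = {0, 1, 2}, j[]  = {0, 1};  /* diagonal block: one entry per row, local column indices */
   PetscScalar a[]  = {1.0, 2.0};
   PetscInt    oi[] = {0, 1, 1}, oj[] = {gcol};  /* off-diagonal block: one entry in row 0, global column index */
   PetscScalar oa[] = {3.0};
   MatCreateMPIAIJWithSplitArrays(comm, 2, 2, PETSC_DETERMINE, PETSC_DETERMINE, i, j, a, oi, oj, oa, &A);
.ve
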
6932 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatMPIAIJSetPreallocation()`, `MatMPIAIJSetPreallocationCSR()`,
6933           `MATMPIAIJ`, `MatCreateAIJ()`, `MatCreateMPIAIJWithArrays()`
6934 @*/
6935 PetscErrorCode MatCreateMPIAIJWithSplitArrays(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt i[], PetscInt j[], PetscScalar a[], PetscInt oi[], PetscInt oj[], PetscScalar oa[], Mat *mat)
6936 {
6937   Mat_MPIAIJ *maij;
6938 
6939   PetscFunctionBegin;
6940   PetscCheck(m >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "local number of rows (m) cannot be PETSC_DECIDE or negative");
6941   PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
6942   PetscCheck(oi[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "oi (row indices) must start with 0");
6943   PetscCall(MatCreate(comm, mat));
6944   PetscCall(MatSetSizes(*mat, m, n, M, N));
6945   PetscCall(MatSetType(*mat, MATMPIAIJ));
6946   maij = (Mat_MPIAIJ *)(*mat)->data;
6947 
6948   (*mat)->preallocated = PETSC_TRUE;
6949 
6950   PetscCall(PetscLayoutSetUp((*mat)->rmap));
6951   PetscCall(PetscLayoutSetUp((*mat)->cmap));
6952 
6953   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, n, i, j, a, &maij->A));
6954   PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, m, (*mat)->cmap->N, oi, oj, oa, &maij->B));
6955 
6956   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
6957   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
6958   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
6959   PetscCall(MatSetOption(*mat, MAT_NO_OFF_PROC_ENTRIES, PETSC_FALSE));
6960   PetscCall(MatSetOption(*mat, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
6961   PetscFunctionReturn(PETSC_SUCCESS);
6962 }
6963 
6964 typedef struct {
6965   Mat       *mp;    /* intermediate products */
6966   PetscBool *mptmp; /* is the intermediate product temporary? */
6967   PetscInt   cp;    /* number of intermediate products */
6968 
6969   /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
6970   PetscInt    *startsj_s, *startsj_r;
6971   PetscScalar *bufa;
6972   Mat          P_oth;
6973 
6974   /* may take advantage of merging product->B */
6975   Mat Bloc; /* B-local by merging diag and off-diag */
6976 
6977   /* cusparse does not support splitting the symbolic and numeric phases.
6978      When api_user is true, we don't need to update the numerical values
6979      of the temporary storage */
6980   PetscBool reusesym;
6981 
6982   /* support for COO values insertion */
6983   PetscScalar *coo_v, *coo_w; /* store on-process and off-process COO scalars; also used as MPI recv and send buffers, respectively */
6984   PetscInt   **own;           /* own[i] points to address of on-process COO indices for Mat mp[i] */
6985   PetscInt   **off;           /* off[i] points to address of off-process COO indices for Mat mp[i] */
6986   PetscBool    hasoffproc;    /* if true, have off-process values insertion (i.e. AtB or PtAP) */
6987   PetscSF      sf;            /* used for non-local values insertion and memory malloc */
6988   PetscMemType mtype;
6989 
6990   /* customization */
6991   PetscBool abmerge;
6992   PetscBool P_oth_bind;
6993 } MatMatMPIAIJBACKEND;
6994 
6995 static PetscErrorCode MatDestroy_MatMatMPIAIJBACKEND(void *data)
6996 {
6997   MatMatMPIAIJBACKEND *mmdata = (MatMatMPIAIJBACKEND *)data;
6998   PetscInt             i;
6999 
7000   PetscFunctionBegin;
7001   PetscCall(PetscFree2(mmdata->startsj_s, mmdata->startsj_r));
7002   PetscCall(PetscFree(mmdata->bufa));
7003   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_v));
7004   PetscCall(PetscSFFree(mmdata->sf, mmdata->mtype, mmdata->coo_w));
7005   PetscCall(MatDestroy(&mmdata->P_oth));
7006   PetscCall(MatDestroy(&mmdata->Bloc));
7007   PetscCall(PetscSFDestroy(&mmdata->sf));
7008   for (i = 0; i < mmdata->cp; i++) PetscCall(MatDestroy(&mmdata->mp[i]));
7009   PetscCall(PetscFree2(mmdata->mp, mmdata->mptmp));
7010   PetscCall(PetscFree(mmdata->own[0]));
7011   PetscCall(PetscFree(mmdata->own));
7012   PetscCall(PetscFree(mmdata->off[0]));
7013   PetscCall(PetscFree(mmdata->off));
7014   PetscCall(PetscFree(mmdata));
7015   PetscFunctionReturn(PETSC_SUCCESS);
7016 }
7017 
7018 /* Copy selected n entries with indices in idx[] of A to v[].
7019    If idx is NULL, copy the whole data array of A to v[]
7020  */
7021 static PetscErrorCode MatSeqAIJCopySubArray(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
7022 {
7023   PetscErrorCode (*f)(Mat, PetscInt, const PetscInt[], PetscScalar[]);
7024 
7025   PetscFunctionBegin;
7026   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", &f));
7027   if (f) {
7028     PetscCall((*f)(A, n, idx, v));
7029   } else {
7030     const PetscScalar *vv;
7031 
7032     PetscCall(MatSeqAIJGetArrayRead(A, &vv));
7033     if (n && idx) {
7034       PetscScalar    *w  = v;
7035       const PetscInt *oi = idx;
7036       PetscInt        j;
7037 
7038       for (j = 0; j < n; j++) *w++ = vv[*oi++];
7039     } else {
7040       PetscCall(PetscArraycpy(v, vv, n));
7041     }
7042     PetscCall(MatSeqAIJRestoreArrayRead(A, &vv));
7043   }
7044   PetscFunctionReturn(PETSC_SUCCESS);
7045 }
7046 
7047 static PetscErrorCode MatProductNumeric_MPIAIJBACKEND(Mat C)
7048 {
7049   MatMatMPIAIJBACKEND *mmdata;
7050   PetscInt             i, n_d, n_o;
7051 
7052   PetscFunctionBegin;
7053   MatCheckProduct(C, 1);
7054   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
7055   mmdata = (MatMatMPIAIJBACKEND *)C->product->data;
7056   if (!mmdata->reusesym) { /* update temporary matrices */
7057     if (mmdata->P_oth) PetscCall(MatGetBrowsOfAoCols_MPIAIJ(C->product->A, C->product->B, MAT_REUSE_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7058     if (mmdata->Bloc) PetscCall(MatMPIAIJGetLocalMatMerge(C->product->B, MAT_REUSE_MATRIX, NULL, &mmdata->Bloc));
7059   }
7060   mmdata->reusesym = PETSC_FALSE;
7061 
7062   for (i = 0; i < mmdata->cp; i++) {
7063     PetscCheck(mmdata->mp[i]->ops->productnumeric, PetscObjectComm((PetscObject)mmdata->mp[i]), PETSC_ERR_PLIB, "Missing numeric op for %s", MatProductTypes[mmdata->mp[i]->product->type]);
7064     PetscCall((*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]));
7065   }
7066   for (i = 0, n_d = 0, n_o = 0; i < mmdata->cp; i++) {
7067     PetscInt noff;
7068 
7069     PetscCall(PetscIntCast(mmdata->off[i + 1] - mmdata->off[i], &noff));
7070     if (mmdata->mptmp[i]) continue;
7071     if (noff) {
7072       PetscInt nown;
7073 
7074       PetscCall(PetscIntCast(mmdata->own[i + 1] - mmdata->own[i], &nown));
7075       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], noff, mmdata->off[i], mmdata->coo_w + n_o));
7076       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], nown, mmdata->own[i], mmdata->coo_v + n_d));
7077       n_o += noff;
7078       n_d += nown;
7079     } else {
7080       Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mmdata->mp[i]->data;
7081 
7082       PetscCall(MatSeqAIJCopySubArray(mmdata->mp[i], mm->nz, NULL, mmdata->coo_v + n_d));
7083       n_d += mm->nz;
7084     }
7085   }
7086   if (mmdata->hasoffproc) { /* offprocess insertion */
7087     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7088     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_SCALAR, mmdata->coo_w, mmdata->coo_v + n_d));
7089   }
7090   PetscCall(MatSetValuesCOO(C, mmdata->coo_v, INSERT_VALUES));
7091   PetscFunctionReturn(PETSC_SUCCESS);
7092 }
7093 
7094 /* Support for Pt * A, A * P, or Pt * A * P */
7095 #define MAX_NUMBER_INTERMEDIATE 4
7096 PetscErrorCode MatProductSymbolic_MPIAIJBACKEND(Mat C)
7097 {
7098   Mat_Product           *product = C->product;
7099   Mat                    A, P, mp[MAX_NUMBER_INTERMEDIATE]; /* A, P and a series of intermediate matrices */
7100   Mat_MPIAIJ            *a, *p;
7101   MatMatMPIAIJBACKEND   *mmdata;
7102   ISLocalToGlobalMapping P_oth_l2g = NULL;
7103   IS                     glob      = NULL;
7104   const char            *prefix;
7105   char                   pprefix[256];
7106   const PetscInt        *globidx, *P_oth_idx;
7107   PetscInt               i, j, cp, m, n, M, N, *coo_i, *coo_j;
7108   PetscCount             ncoo, ncoo_d, ncoo_o, ncoo_oown;
7109   PetscInt               cmapt[MAX_NUMBER_INTERMEDIATE], rmapt[MAX_NUMBER_INTERMEDIATE]; /* col/row map type for each Mat in mp[]. */
7110                                                                                          /* type-0: consecutive, start from 0; type-1: consecutive with */
7111                                                                                          /* a base offset; type-2: sparse with a local to global map table */
7112   const PetscInt *cmapa[MAX_NUMBER_INTERMEDIATE], *rmapa[MAX_NUMBER_INTERMEDIATE];       /* col/row local to global map array (table) for type-2 map type */
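  /* E.g., a type-1 row map with base offset rstart sends local row r of mp[i] to global row rstart + r,
     while a type-2 map sends it to rmapa[i][r]; column maps work the same way (illustrative only). */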
7113 
7114   MatProductType ptype;
7115   PetscBool      mptmp[MAX_NUMBER_INTERMEDIATE], hasoffproc = PETSC_FALSE, iscuda, iship, iskokk;
7116   PetscMPIInt    size;
7117 
7118   PetscFunctionBegin;
7119   MatCheckProduct(C, 1);
7120   PetscCheck(!product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty");
7121   ptype = product->type;
7122   if (product->A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
7123     ptype                                          = MATPRODUCT_AB;
7124     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
7125   }
7126   switch (ptype) {
7127   case MATPRODUCT_AB:
7128     A          = product->A;
7129     P          = product->B;
7130     m          = A->rmap->n;
7131     n          = P->cmap->n;
7132     M          = A->rmap->N;
7133     N          = P->cmap->N;
7134     hasoffproc = PETSC_FALSE; /* will not scatter mat product values to other processes */
7135     break;
7136   case MATPRODUCT_AtB:
7137     P          = product->A;
7138     A          = product->B;
7139     m          = P->cmap->n;
7140     n          = A->cmap->n;
7141     M          = P->cmap->N;
7142     N          = A->cmap->N;
7143     hasoffproc = PETSC_TRUE;
7144     break;
7145   case MATPRODUCT_PtAP:
7146     A          = product->A;
7147     P          = product->B;
7148     m          = P->cmap->n;
7149     n          = P->cmap->n;
7150     M          = P->cmap->N;
7151     N          = P->cmap->N;
7152     hasoffproc = PETSC_TRUE;
7153     break;
7154   default:
7155     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7156   }
7157   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)C), &size));
7158   if (size == 1) hasoffproc = PETSC_FALSE;
7159 
7160   /* defaults */
7161   for (i = 0; i < MAX_NUMBER_INTERMEDIATE; i++) {
7162     mp[i]    = NULL;
7163     mptmp[i] = PETSC_FALSE;
7164     rmapt[i] = -1;
7165     cmapt[i] = -1;
7166     rmapa[i] = NULL;
7167     cmapa[i] = NULL;
7168   }
7169 
7170   /* customization */
7171   PetscCall(PetscNew(&mmdata));
7172   mmdata->reusesym = product->api_user;
7173   if (ptype == MATPRODUCT_AB) {
7174     if (product->api_user) {
7175       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
7176       PetscCall(PetscOptionsBool("-matmatmult_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7177       PetscCall(PetscOptionsBool("-matmatmult_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7178       PetscOptionsEnd();
7179     } else {
7180       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
7181       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_mergeB", "Merge product->B local matrices", "MatMatMult", mmdata->abmerge, &mmdata->abmerge, NULL));
7182       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7183       PetscOptionsEnd();
7184     }
7185   } else if (ptype == MATPRODUCT_PtAP) {
7186     if (product->api_user) {
7187       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
7188       PetscCall(PetscOptionsBool("-matptap_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7189       PetscOptionsEnd();
7190     } else {
7191       PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
7192       PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_pothbind", "Bind P_oth to CPU", "MatBindToCPU", mmdata->P_oth_bind, &mmdata->P_oth_bind, NULL));
7193       PetscOptionsEnd();
7194     }
7195   }
7196   a = (Mat_MPIAIJ *)A->data;
7197   p = (Mat_MPIAIJ *)P->data;
7198   PetscCall(MatSetSizes(C, m, n, M, N));
7199   PetscCall(PetscLayoutSetUp(C->rmap));
7200   PetscCall(PetscLayoutSetUp(C->cmap));
7201   PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
7202   PetscCall(MatGetOptionsPrefix(C, &prefix));
7203 
7204   cp = 0;
7205   switch (ptype) {
7206   case MATPRODUCT_AB: /* A * P */
7207     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7208 
7209     /* A_diag * P_local (merged or not) */
7210     if (mmdata->abmerge) { /* P's diagonal and off-diag blocks are merged to one matrix, then multiplied by A_diag */
7211       /* P is product->B */
7212       PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7213       PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7214       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7215       PetscCall(MatProductSetFill(mp[cp], product->fill));
7216       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7217       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7218       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7219       mp[cp]->product->api_user = product->api_user;
7220       PetscCall(MatProductSetFromOptions(mp[cp]));
7221       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7222       PetscCall(ISGetIndices(glob, &globidx));
7223       rmapt[cp] = 1;
7224       cmapt[cp] = 2;
7225       cmapa[cp] = globidx;
7226       mptmp[cp] = PETSC_FALSE;
7227       cp++;
7228     } else { /* A_diag * P_diag and A_diag * P_off */
7229       PetscCall(MatProductCreate(a->A, p->A, NULL, &mp[cp]));
7230       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7231       PetscCall(MatProductSetFill(mp[cp], product->fill));
7232       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7233       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7234       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7235       mp[cp]->product->api_user = product->api_user;
7236       PetscCall(MatProductSetFromOptions(mp[cp]));
7237       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7238       rmapt[cp] = 1;
7239       cmapt[cp] = 1;
7240       mptmp[cp] = PETSC_FALSE;
7241       cp++;
7242       PetscCall(MatProductCreate(a->A, p->B, NULL, &mp[cp]));
7243       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7244       PetscCall(MatProductSetFill(mp[cp], product->fill));
7245       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7246       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7247       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7248       mp[cp]->product->api_user = product->api_user;
7249       PetscCall(MatProductSetFromOptions(mp[cp]));
7250       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7251       rmapt[cp] = 1;
7252       cmapt[cp] = 2;
7253       cmapa[cp] = p->garray;
7254       mptmp[cp] = PETSC_FALSE;
7255       cp++;
7256     }
7257 
7258     /* A_off * P_other */
7259     if (mmdata->P_oth) {
7260       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g)); /* make P_oth use local col ids */
7261       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7262       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7263       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7264       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7265       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7266       PetscCall(MatProductSetFill(mp[cp], product->fill));
7267       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7268       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7269       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7270       mp[cp]->product->api_user = product->api_user;
7271       PetscCall(MatProductSetFromOptions(mp[cp]));
7272       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7273       rmapt[cp] = 1;
7274       cmapt[cp] = 2;
7275       cmapa[cp] = P_oth_idx;
7276       mptmp[cp] = PETSC_FALSE;
7277       cp++;
7278     }
7279     break;
7280 
7281   case MATPRODUCT_AtB: /* (P^t * A): P_diag^t * A_loc + P_off^t * A_loc */
7282     /* A is product->B */
7283     PetscCall(MatMPIAIJGetLocalMatMerge(A, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7284     if (A == P) { /* when A==P, we can take advantage of the already merged mmdata->Bloc */
7285       PetscCall(MatProductCreate(mmdata->Bloc, mmdata->Bloc, NULL, &mp[cp]));
7286       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7287       PetscCall(MatProductSetFill(mp[cp], product->fill));
7288       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7289       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7290       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7291       mp[cp]->product->api_user = product->api_user;
7292       PetscCall(MatProductSetFromOptions(mp[cp]));
7293       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7294       PetscCall(ISGetIndices(glob, &globidx));
7295       rmapt[cp] = 2;
7296       rmapa[cp] = globidx;
7297       cmapt[cp] = 2;
7298       cmapa[cp] = globidx;
7299       mptmp[cp] = PETSC_FALSE;
7300       cp++;
7301     } else {
7302       PetscCall(MatProductCreate(p->A, mmdata->Bloc, NULL, &mp[cp]));
7303       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7304       PetscCall(MatProductSetFill(mp[cp], product->fill));
7305       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7306       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7307       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7308       mp[cp]->product->api_user = product->api_user;
7309       PetscCall(MatProductSetFromOptions(mp[cp]));
7310       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7311       PetscCall(ISGetIndices(glob, &globidx));
7312       rmapt[cp] = 1;
7313       cmapt[cp] = 2;
7314       cmapa[cp] = globidx;
7315       mptmp[cp] = PETSC_FALSE;
7316       cp++;
7317       PetscCall(MatProductCreate(p->B, mmdata->Bloc, NULL, &mp[cp]));
7318       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7319       PetscCall(MatProductSetFill(mp[cp], product->fill));
7320       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7321       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7322       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7323       mp[cp]->product->api_user = product->api_user;
7324       PetscCall(MatProductSetFromOptions(mp[cp]));
7325       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7326       rmapt[cp] = 2;
7327       rmapa[cp] = p->garray;
7328       cmapt[cp] = 2;
7329       cmapa[cp] = globidx;
7330       mptmp[cp] = PETSC_FALSE;
7331       cp++;
7332     }
7333     break;
7334   case MATPRODUCT_PtAP:
7335     PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &mmdata->startsj_s, &mmdata->startsj_r, &mmdata->bufa, &mmdata->P_oth));
7336     /* P is product->B */
7337     PetscCall(MatMPIAIJGetLocalMatMerge(P, MAT_INITIAL_MATRIX, &glob, &mmdata->Bloc));
7338     PetscCall(MatProductCreate(a->A, mmdata->Bloc, NULL, &mp[cp]));
7339     PetscCall(MatProductSetType(mp[cp], MATPRODUCT_PtAP));
7340     PetscCall(MatProductSetFill(mp[cp], product->fill));
7341     PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7342     PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7343     PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7344     mp[cp]->product->api_user = product->api_user;
7345     PetscCall(MatProductSetFromOptions(mp[cp]));
7346     PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7347     PetscCall(ISGetIndices(glob, &globidx));
7348     rmapt[cp] = 2;
7349     rmapa[cp] = globidx;
7350     cmapt[cp] = 2;
7351     cmapa[cp] = globidx;
7352     mptmp[cp] = PETSC_FALSE;
7353     cp++;
7354     if (mmdata->P_oth) {
7355       PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth, &P_oth_l2g));
7356       PetscCall(ISLocalToGlobalMappingGetIndices(P_oth_l2g, &P_oth_idx));
7357       PetscCall(MatSetType(mmdata->P_oth, ((PetscObject)a->B)->type_name));
7358       PetscCall(MatBindToCPU(mmdata->P_oth, mmdata->P_oth_bind));
7359       PetscCall(MatProductCreate(a->B, mmdata->P_oth, NULL, &mp[cp]));
7360       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AB));
7361       PetscCall(MatProductSetFill(mp[cp], product->fill));
7362       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7363       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7364       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7365       mp[cp]->product->api_user = product->api_user;
7366       PetscCall(MatProductSetFromOptions(mp[cp]));
7367       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7368       mptmp[cp] = PETSC_TRUE;
7369       cp++;
7370       PetscCall(MatProductCreate(mmdata->Bloc, mp[1], NULL, &mp[cp]));
7371       PetscCall(MatProductSetType(mp[cp], MATPRODUCT_AtB));
7372       PetscCall(MatProductSetFill(mp[cp], product->fill));
7373       PetscCall(PetscSNPrintf(pprefix, sizeof(pprefix), "backend_p%" PetscInt_FMT "_", cp));
7374       PetscCall(MatSetOptionsPrefix(mp[cp], prefix));
7375       PetscCall(MatAppendOptionsPrefix(mp[cp], pprefix));
7376       mp[cp]->product->api_user = product->api_user;
7377       PetscCall(MatProductSetFromOptions(mp[cp]));
7378       PetscCall((*mp[cp]->ops->productsymbolic)(mp[cp]));
7379       rmapt[cp] = 2;
7380       rmapa[cp] = globidx;
7381       cmapt[cp] = 2;
7382       cmapa[cp] = P_oth_idx;
7383       mptmp[cp] = PETSC_FALSE;
7384       cp++;
7385     }
7386     break;
7387   default:
7388     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7389   }
7390   /* sanity check */
7391   if (size > 1)
7392     for (i = 0; i < cp; i++) PetscCheck(rmapt[i] != 2 || hasoffproc, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected offproc map type for product %" PetscInt_FMT, i);
7393 
7394   PetscCall(PetscMalloc2(cp, &mmdata->mp, cp, &mmdata->mptmp));
7395   for (i = 0; i < cp; i++) {
7396     mmdata->mp[i]    = mp[i];
7397     mmdata->mptmp[i] = mptmp[i];
7398   }
7399   mmdata->cp             = cp;
7400   C->product->data       = mmdata;
7401   C->product->destroy    = MatDestroy_MatMatMPIAIJBACKEND;
7402   C->ops->productnumeric = MatProductNumeric_MPIAIJBACKEND;
7403 
7404   /* memory type */
7405   mmdata->mtype = PETSC_MEMTYPE_HOST;
7406   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iscuda, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, ""));
7407   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iship, MATSEQAIJHIPSPARSE, MATMPIAIJHIPSPARSE, ""));
7408   PetscCall(PetscObjectTypeCompareAny((PetscObject)C, &iskokk, MATSEQAIJKOKKOS, MATMPIAIJKOKKOS, ""));
7409   if (iscuda) mmdata->mtype = PETSC_MEMTYPE_CUDA;
7410   else if (iship) mmdata->mtype = PETSC_MEMTYPE_HIP;
7411   else if (iskokk) mmdata->mtype = PETSC_MEMTYPE_KOKKOS;
7412 
7413   /* prepare coo coordinates for values insertion */
7414 
7415   /* Count the total number of nonzeros of the intermediate seqaij Mats
7416     ncoo_d:    # of nonzeros of matrices that do not have offproc entries
7417     ncoo_o:    # of nonzeros (of matrices that might have offproc entries) that will be inserted on remote procs
7418     ncoo_oown: # of nonzeros (of matrices that might have offproc entries) that will be inserted locally
7419   */
7420   for (cp = 0, ncoo_d = 0, ncoo_o = 0, ncoo_oown = 0; cp < mmdata->cp; cp++) {
7421     Mat_SeqAIJ *mm = (Mat_SeqAIJ *)mp[cp]->data;
7422     if (mptmp[cp]) continue;
7423       if (rmapt[cp] == 2 && hasoffproc) { /* the rows need to be scattered to all processes (which might include self) */
7424       const PetscInt *rmap = rmapa[cp];
7425       const PetscInt  mr   = mp[cp]->rmap->n;
7426       const PetscInt  rs   = C->rmap->rstart;
7427       const PetscInt  re   = C->rmap->rend;
7428       const PetscInt *ii   = mm->i;
7429       for (i = 0; i < mr; i++) {
7430         const PetscInt gr = rmap[i];
7431         const PetscInt nz = ii[i + 1] - ii[i];
7432         if (gr < rs || gr >= re) ncoo_o += nz; /* this row is offproc */
7433         else ncoo_oown += nz;                  /* this row is local */
7434       }
7435     } else ncoo_d += mm->nz;
7436   }
7437 
7438   /*
7439     ncoo: total number of nonzeros (including those inserted by remote procs) belonging to this proc
7440 
7441     ncoo = ncoo_d + ncoo_oown + ncoo2, where ncoo2 is the number of nonzeros inserted to me by other procs.
7442 
7443     off[0] points to a big index array, which is shared by off[1,2,...]. Similarly, for own[0].
7444 
7445     off[p]: points to the segment for matrix mp[p], storing the locations of nonzeros that mp[p] will insert on other processes
7446     own[p]: points to the segment for matrix mp[p], storing the locations of nonzeros that mp[p] will insert locally
7447     so off[p+1]-off[p] is the number of nonzeros that mp[p] will send to others.
7448 
7449     coo_i/j/v[]: [ncoo] row/col/val of nonzeros belonging to this proc.
7450     E.g. coo_i[]: the beginning part (of size ncoo_d + ncoo_oown) stores the row indices of local nonzeros, and the remaining part stores the row indices of nonzeros this proc will receive.
7451   */
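  /* A small worked example of the layout above (purely hypothetical sizes): with two non-temporary
     products, mp[0] with rmapt 1 and 10 nonzeros (all local) and mp[1] with rmapt 2 having 12 offproc
     and 8 locally owned nonzeros, and supposing remote ranks send us 5 entries, we would have

       ncoo_d = 10, ncoo_oown = 8, ncoo_o = 12, ncoo2 = 5, ncoo = 10 + 8 + 5 = 23

       off[1] - off[0] = 0  (mp[0] sends nothing),   off[2] - off[1] = 12
       own[1] - own[0] = 0  (mp[0] owns all rows),   own[2] - own[1] = 8

     so coo_i/j/v[0, 18) hold the locally inserted entries and coo_i/j/v[18, 23) hold received ones. */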
7452   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->off)); /* +1 to make a csr-like data structure */
7453   PetscCall(PetscCalloc1(mmdata->cp + 1, &mmdata->own));
7454 
7455   /* gather (i,j) of nonzeros inserted by remote procs */
7456   if (hasoffproc) {
7457     PetscSF  msf;
7458     PetscInt ncoo2, *coo_i2, *coo_j2;
7459 
7460     PetscCall(PetscMalloc1(ncoo_o, &mmdata->off[0]));
7461     PetscCall(PetscMalloc1(ncoo_oown, &mmdata->own[0]));
7462     PetscCall(PetscMalloc2(ncoo_o, &coo_i, ncoo_o, &coo_j)); /* to collect (i,j) of entries to be sent to others */
7463 
7464     for (cp = 0, ncoo_o = 0; cp < mmdata->cp; cp++) {
7465       Mat_SeqAIJ *mm     = (Mat_SeqAIJ *)mp[cp]->data;
7466       PetscInt   *idxoff = mmdata->off[cp];
7467       PetscInt   *idxown = mmdata->own[cp];
7468       if (!mptmp[cp] && rmapt[cp] == 2) { /* row map is sparse */
7469         const PetscInt *rmap = rmapa[cp];
7470         const PetscInt *cmap = cmapa[cp];
7471         const PetscInt *ii   = mm->i;
7472         PetscInt       *coi  = coo_i + ncoo_o;
7473         PetscInt       *coj  = coo_j + ncoo_o;
7474         const PetscInt  mr   = mp[cp]->rmap->n;
7475         const PetscInt  rs   = C->rmap->rstart;
7476         const PetscInt  re   = C->rmap->rend;
7477         const PetscInt  cs   = C->cmap->rstart;
7478         for (i = 0; i < mr; i++) {
7479           const PetscInt *jj = mm->j + ii[i];
7480           const PetscInt  gr = rmap[i];
7481           const PetscInt  nz = ii[i + 1] - ii[i];
7482           if (gr < rs || gr >= re) { /* this is an offproc row */
7483             for (j = ii[i]; j < ii[i + 1]; j++) {
7484               *coi++    = gr;
7485               *idxoff++ = j;
7486             }
7487             if (!cmapt[cp]) { /* already global */
7488               for (j = 0; j < nz; j++) *coj++ = jj[j];
7489             } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7490               for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7491             } else { /* type-2, local to global for sparse columns */
7492               for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7493             }
7494             ncoo_o += nz;
7495           } else { /* this is a local row */
7496             for (j = ii[i]; j < ii[i + 1]; j++) *idxown++ = j;
7497           }
7498         }
7499       }
7500       mmdata->off[cp + 1] = idxoff;
7501       mmdata->own[cp + 1] = idxown;
7502     }
7503 
7504     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7505     PetscInt incoo_o;
7506     PetscCall(PetscIntCast(ncoo_o, &incoo_o));
7507     PetscCall(PetscSFSetGraphLayout(mmdata->sf, C->rmap, incoo_o /*nleaves*/, NULL /*ilocal*/, PETSC_OWN_POINTER, coo_i));
7508     PetscCall(PetscSFGetMultiSF(mmdata->sf, &msf));
7509     PetscCall(PetscSFGetGraph(msf, &ncoo2 /*nroots*/, NULL, NULL, NULL));
7510     ncoo = ncoo_d + ncoo_oown + ncoo2;
7511     PetscCall(PetscMalloc2(ncoo, &coo_i2, ncoo, &coo_j2));
7512     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown)); /* put (i,j) of remote nonzeros at the back */
7513     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_i, coo_i2 + ncoo_d + ncoo_oown));
7514     PetscCall(PetscSFGatherBegin(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7515     PetscCall(PetscSFGatherEnd(mmdata->sf, MPIU_INT, coo_j, coo_j2 + ncoo_d + ncoo_oown));
7516     PetscCall(PetscFree2(coo_i, coo_j));
7517     /* allocate MPI send buffer to collect nonzero values to be sent to remote procs */
7518     PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo_o * sizeof(PetscScalar), (void **)&mmdata->coo_w));
7519     coo_i = coo_i2;
7520     coo_j = coo_j2;
7521   } else { /* no offproc values insertion */
7522     ncoo = ncoo_d;
7523     PetscCall(PetscMalloc2(ncoo, &coo_i, ncoo, &coo_j));
7524 
7525     PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &mmdata->sf));
7526     PetscCall(PetscSFSetGraph(mmdata->sf, 0, 0, NULL, PETSC_OWN_POINTER, NULL, PETSC_OWN_POINTER));
7527     PetscCall(PetscSFSetUp(mmdata->sf));
7528   }
7529   mmdata->hasoffproc = hasoffproc;
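  /* The communication pattern above in isolation: leaves are this rank's offproc COO entries, roots
     are the owned rows of C, and the multi-SF provides one root slot per incoming leaf, so gathered
     data are simply concatenated at the owner. A minimal standalone sketch, where nleaves, rowidx
     (global row of each leaf), leafdata and rootbuf are hypothetical placeholders:

       PetscSF  sf, msf;
       PetscInt nrecv;
       PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)C), &sf));
       PetscCall(PetscSFSetGraphLayout(sf, C->rmap, nleaves, NULL, PETSC_OWN_POINTER, rowidx));
       PetscCall(PetscSFGetMultiSF(sf, &msf));
       PetscCall(PetscSFGetGraph(msf, &nrecv, NULL, NULL, NULL)); // number of entries we will receive
       PetscCall(PetscSFGatherBegin(sf, MPIU_INT, leafdata, rootbuf));
       PetscCall(PetscSFGatherEnd(sf, MPIU_INT, leafdata, rootbuf));
  */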
7530 
7531   /* gather (i,j) of nonzeros inserted locally */
7532   for (cp = 0, ncoo_d = 0; cp < mmdata->cp; cp++) {
7533     Mat_SeqAIJ     *mm   = (Mat_SeqAIJ *)mp[cp]->data;
7534     PetscInt       *coi  = coo_i + ncoo_d;
7535     PetscInt       *coj  = coo_j + ncoo_d;
7536     const PetscInt *jj   = mm->j;
7537     const PetscInt *ii   = mm->i;
7538     const PetscInt *cmap = cmapa[cp];
7539     const PetscInt *rmap = rmapa[cp];
7540     const PetscInt  mr   = mp[cp]->rmap->n;
7541     const PetscInt  rs   = C->rmap->rstart;
7542     const PetscInt  re   = C->rmap->rend;
7543     const PetscInt  cs   = C->cmap->rstart;
7544 
7545     if (mptmp[cp]) continue;
7546     if (rmapt[cp] == 1) { /* consecutive rows */
7547       /* fill coo_i */
7548       for (i = 0; i < mr; i++) {
7549         const PetscInt gr = i + rs;
7550         for (j = ii[i]; j < ii[i + 1]; j++) coi[j] = gr;
7551       }
7552       /* fill coo_j */
7553       if (!cmapt[cp]) { /* type-0, already global */
7554         PetscCall(PetscArraycpy(coj, jj, mm->nz));
7555       } else if (cmapt[cp] == 1) {                        /* type-1, local to global for consecutive columns of C */
7556         for (j = 0; j < mm->nz; j++) coj[j] = jj[j] + cs; /* lid + col start */
7557       } else {                                            /* type-2, local to global for sparse columns */
7558         for (j = 0; j < mm->nz; j++) coj[j] = cmap[jj[j]];
7559       }
7560       ncoo_d += mm->nz;
7561     } else if (rmapt[cp] == 2) { /* sparse rows */
7562       for (i = 0; i < mr; i++) {
7563         const PetscInt *jj = mm->j + ii[i];
7564         const PetscInt  gr = rmap[i];
7565         const PetscInt  nz = ii[i + 1] - ii[i];
7566         if (gr >= rs && gr < re) { /* local rows */
7567           for (j = ii[i]; j < ii[i + 1]; j++) *coi++ = gr;
7568           if (!cmapt[cp]) { /* type-0, already global */
7569             for (j = 0; j < nz; j++) *coj++ = jj[j];
7570           } else if (cmapt[cp] == 1) { /* local to global for owned columns of C */
7571             for (j = 0; j < nz; j++) *coj++ = jj[j] + cs;
7572           } else { /* type-2, local to global for sparse columns */
7573             for (j = 0; j < nz; j++) *coj++ = cmap[jj[j]];
7574           }
7575           ncoo_d += nz;
7576         }
7577       }
7578     }
7579   }
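  /* Example of the three column-map types handled above, for a rank whose owned columns of C start at
     cs: a type-0 product already stores global column indices, so j is copied as is; a type-1 product
     stores local indices of C's owned columns, so j maps to j + cs; a type-2 product stores indices
     into a sparse map, so j maps to cmap[j] (cmap being e.g. the merged-local-matrix indices or garray). */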
7580   if (glob) PetscCall(ISRestoreIndices(glob, &globidx));
7581   PetscCall(ISDestroy(&glob));
7582   if (P_oth_l2g) PetscCall(ISLocalToGlobalMappingRestoreIndices(P_oth_l2g, &P_oth_idx));
7583   PetscCall(ISLocalToGlobalMappingDestroy(&P_oth_l2g));
7584   /* allocate an array to store all nonzeros (inserted locally or remotely) belonging to this proc */
7585   PetscCall(PetscSFMalloc(mmdata->sf, mmdata->mtype, ncoo * sizeof(PetscScalar), (void **)&mmdata->coo_v));
7586 
7587   /* set block sizes */
7588   A = product->A;
7589   P = product->B;
7590   switch (ptype) {
7591   case MATPRODUCT_PtAP:
7592     PetscCall(MatSetBlockSizes(C, P->cmap->bs, P->cmap->bs));
7593     break;
7594   case MATPRODUCT_RARt:
7595     PetscCall(MatSetBlockSizes(C, P->rmap->bs, P->rmap->bs));
7596     break;
7597   case MATPRODUCT_ABC:
7598     PetscCall(MatSetBlockSizesFromMats(C, A, product->C));
7599     break;
7600   case MATPRODUCT_AB:
7601     PetscCall(MatSetBlockSizesFromMats(C, A, P));
7602     break;
7603   case MATPRODUCT_AtB:
7604     PetscCall(MatSetBlockSizes(C, A->cmap->bs, P->cmap->bs));
7605     break;
7606   case MATPRODUCT_ABt:
7607     PetscCall(MatSetBlockSizes(C, A->rmap->bs, P->rmap->bs));
7608     break;
7609   default:
7610     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Not for product type %s", MatProductTypes[ptype]);
7611   }
7612 
7613   /* preallocate with COO data */
7614   PetscCall(MatSetPreallocationCOO(C, ncoo, coo_i, coo_j));
7615   PetscCall(PetscFree2(coo_i, coo_j));
7616   PetscFunctionReturn(PETSC_SUCCESS);
7617 }
7618 
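/*
   How the symbolic phase above is typically reached (a sketch, not code from this file): the user
   creates a product of device-backed MPIAIJ matrices and PETSc dispatches here through
   MatProductSetFromOptions_MPIAIJBACKEND() below:

     Mat C;
     PetscCall(MatProductCreate(A, P, NULL, &C)); // A and P are e.g. MATMPIAIJCUSPARSE
     PetscCall(MatProductSetType(C, MATPRODUCT_PtAP));
     PetscCall(MatProductSetFromOptions(C)); // selects MatProductSymbolic_MPIAIJBACKEND
     PetscCall(MatProductSymbolic(C));       // builds the COO pattern assembled above
     PetscCall(MatProductNumeric(C));        // runs MatProductNumeric_MPIAIJBACKEND
*/
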
7619 PetscErrorCode MatProductSetFromOptions_MPIAIJBACKEND(Mat mat)
7620 {
7621   Mat_Product *product = mat->product;
7622 #if defined(PETSC_HAVE_DEVICE)
7623   PetscBool match  = PETSC_FALSE;
7624   PetscBool usecpu = PETSC_FALSE;
7625 #else
7626   PetscBool match = PETSC_TRUE;
7627 #endif
7628 
7629   PetscFunctionBegin;
7630   MatCheckProduct(mat, 1);
7631 #if defined(PETSC_HAVE_DEVICE)
7632   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, ((PetscObject)product->A)->type_name, &match));
7633   if (match) { /* we can always fallback to the CPU if requested */
7634     switch (product->type) {
7635     case MATPRODUCT_AB:
7636       if (product->api_user) {
7637         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
7638         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7639         PetscOptionsEnd();
7640       } else {
7641         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
7642         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
7643         PetscOptionsEnd();
7644       }
7645       break;
7646     case MATPRODUCT_AtB:
7647       if (product->api_user) {
7648         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
7649         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7650         PetscOptionsEnd();
7651       } else {
7652         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
7653         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
7654         PetscOptionsEnd();
7655       }
7656       break;
7657     case MATPRODUCT_PtAP:
7658       if (product->api_user) {
7659         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
7660         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7661         PetscOptionsEnd();
7662       } else {
7663         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
7664         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
7665         PetscOptionsEnd();
7666       }
7667       break;
7668     default:
7669       break;
7670     }
7671     match = (PetscBool)!usecpu;
7672   }
7673 #endif
7674   if (match) {
7675     switch (product->type) {
7676     case MATPRODUCT_AB:
7677     case MATPRODUCT_AtB:
7678     case MATPRODUCT_PtAP:
7679       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJBACKEND;
7680       break;
7681     default:
7682       break;
7683     }
7684   }
7685   /* fallback to MPIAIJ ops */
7686   if (!mat->ops->productsymbolic) PetscCall(MatProductSetFromOptions_MPIAIJ(mat));
7687   PetscFunctionReturn(PETSC_SUCCESS);
7688 }
7689 
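/*
   Example command-line usage of the options handled above, assuming an application that calls
   MatMatMult()/MatPtAP() on device matrices:

     ./app -mat_type aijcusparse -matmatmult_backend_cpu            # MatMatMult() falls back to the CPU
     ./app -mat_type aijcusparse -matptap_backend_cpu               # MatPtAP() falls back to the CPU
     ./app -mat_type aijkokkos -mat_product_algorithm_backend_cpu   # MatProduct API falls back to the CPU
*/
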
7690 /*
7691    Produces a set of block column indices of the matrix row, one for each block represented in the original row
7692 
7693    n - the number of block indices in cc[]
7694    cc - the block indices (must be large enough to contain the indices)
7695 */
7696 static inline PetscErrorCode MatCollapseRow(Mat Amat, PetscInt row, PetscInt bs, PetscInt *n, PetscInt *cc)
7697 {
7698   PetscInt        cnt = -1, nidx, j;
7699   const PetscInt *idx;
7700 
7701   PetscFunctionBegin;
7702   PetscCall(MatGetRow(Amat, row, &nidx, &idx, NULL));
7703   if (nidx) {
7704     cnt     = 0;
7705     cc[cnt] = idx[0] / bs;
7706     for (j = 1; j < nidx; j++) {
7707       if (cc[cnt] < idx[j] / bs) cc[++cnt] = idx[j] / bs;
7708     }
7709   }
7710   PetscCall(MatRestoreRow(Amat, row, &nidx, &idx, NULL));
7711   *n = cnt + 1;
7712   PetscFunctionReturn(PETSC_SUCCESS);
7713 }
7714 
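/*
   Example: with bs = 2 and a row whose (sorted) column indices are {0, 1, 4, 5, 9}, MatCollapseRow()
   returns n = 3 and cc = {0, 2, 4}, the distinct block columns touched by the row; the single pass
   works because MatGetRow() returns the column indices in increasing order.
*/
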
7715 /*
7716     Produces a set of block column indices of the matrix block row, one for each block represented in the original set of rows
7717 
7718     ncollapsed - the number of block indices
7719     collapsed - the block indices (must be large enough to contain the indices)
7720 */
7721 static inline PetscErrorCode MatCollapseRows(Mat Amat, PetscInt start, PetscInt bs, PetscInt *w0, PetscInt *w1, PetscInt *w2, PetscInt *ncollapsed, PetscInt **collapsed)
7722 {
7723   PetscInt i, nprev, *cprev = w0, ncur = 0, *ccur = w1, *merged = w2, *cprevtmp;
7724 
7725   PetscFunctionBegin;
7726   PetscCall(MatCollapseRow(Amat, start, bs, &nprev, cprev));
7727   for (i = start + 1; i < start + bs; i++) {
7728     PetscCall(MatCollapseRow(Amat, i, bs, &ncur, ccur));
7729     PetscCall(PetscMergeIntArray(nprev, cprev, ncur, ccur, &nprev, &merged));
7730     cprevtmp = cprev;
7731     cprev    = merged;
7732     merged   = cprevtmp;
7733   }
7734   *ncollapsed = nprev;
7735   if (collapsed) *collapsed = cprev;
7736   PetscFunctionReturn(PETSC_SUCCESS);
7737 }
7738 
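/*
   Example: continuing the sketch above with bs = 2, if row `start` collapses to {0, 2} and row
   `start + 1` collapses to {2, 4}, PetscMergeIntArray() merges them so that ncollapsed = 3 and
   *collapsed = {0, 2, 4}; w0/w1/w2 are caller-provided work arrays large enough for any collapsed
   block row.
*/
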
7739 /*
7740  MatCreateGraph_Simple_AIJ - create simple scalar matrix (graph) from potentially blocked matrix
7741 
7742  Input Parameters:
7743  + Amat - matrix
7744  . symmetrize - make the result symmetric
7745  . scale - scale with diagonal
      . filter - filter out connections with magnitude below this tolerance (a negative value keeps all entries)
      . index_size - size of index[]; pass 0 to use all rows/columns of each block when computing the block norm
      - index - the rows/columns within each block to use when computing the block norm
7746 
7747  Output Parameter:
7748  . a_Gmat - output scalar graph with all entries >= 0
7749 
7750 */
7751 PETSC_INTERN PetscErrorCode MatCreateGraph_Simple_AIJ(Mat Amat, PetscBool symmetrize, PetscBool scale, PetscReal filter, PetscInt index_size, PetscInt index[], Mat *a_Gmat)
7752 {
7753   PetscInt  Istart, Iend, Ii, jj, kk, ncols, nloc, NN, MM, bs;
7754   MPI_Comm  comm;
7755   Mat       Gmat;
7756   PetscBool ismpiaij, isseqaij;
7757   Mat       a, b, c;
7758   MatType   jtype;
7759 
7760   PetscFunctionBegin;
7761   PetscCall(PetscObjectGetComm((PetscObject)Amat, &comm));
7762   PetscCall(MatGetOwnershipRange(Amat, &Istart, &Iend));
7763   PetscCall(MatGetSize(Amat, &MM, &NN));
7764   PetscCall(MatGetBlockSize(Amat, &bs));
7765   nloc = (Iend - Istart) / bs;
7766 
7767   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATSEQAIJ, &isseqaij));
7768   PetscCall(PetscObjectBaseTypeCompare((PetscObject)Amat, MATMPIAIJ, &ismpiaij));
7769   PetscCheck(isseqaij || ismpiaij, comm, PETSC_ERR_USER, "Require (MPI)AIJ matrix type");
7770 
7771   /* TODO GPU: these calls are potentially expensive if matrices are large and we want to use the GPU */
7772   /* One solution would be to provide a new API, MatAIJGetCollapsedAIJ, for which each class can provide a fast
7773      implementation */
7774   if (bs > 1) {
7775     PetscCall(MatGetType(Amat, &jtype));
7776     PetscCall(MatCreate(comm, &Gmat));
7777     PetscCall(MatSetType(Gmat, jtype));
7778     PetscCall(MatSetSizes(Gmat, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE));
7779     PetscCall(MatSetBlockSizes(Gmat, 1, 1));
7780     if (isseqaij || ((Mat_MPIAIJ *)Amat->data)->garray) {
7781       PetscInt  *d_nnz, *o_nnz;
7782       MatScalar *aa, val, *AA;
7783       PetscInt  *aj, *ai, *AJ, nc, nmax = 0;
7784 
7785       if (isseqaij) {
7786         a = Amat;
7787         b = NULL;
7788       } else {
7789         Mat_MPIAIJ *d = (Mat_MPIAIJ *)Amat->data;
7790         a             = d->A;
7791         b             = d->B;
7792       }
7793       PetscCall(PetscInfo(Amat, "New bs>1 Graph. nloc=%" PetscInt_FMT "\n", nloc));
7794       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7795       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
7796         PetscInt       *nnz = (c == a) ? d_nnz : o_nnz;
7797         const PetscInt *cols1, *cols2;
7798 
7799         for (PetscInt brow = 0, nc1, nc2, ok = 1; brow < nloc * bs; brow += bs) { // block rows
7800           PetscCall(MatGetRow(c, brow, &nc2, &cols2, NULL));
7801           nnz[brow / bs] = nc2 / bs;
7802           if (nc2 % bs) ok = 0;
7803           if (nnz[brow / bs] > nmax) nmax = nnz[brow / bs];
7804           for (PetscInt ii = 1; ii < bs; ii++) { // check for non-dense blocks
7805             PetscCall(MatGetRow(c, brow + ii, &nc1, &cols1, NULL));
7806             if (nc1 != nc2) ok = 0;
7807             else {
7808               for (PetscInt jj = 0; jj < nc1 && ok == 1; jj++) {
7809                 if (cols1[jj] != cols2[jj]) ok = 0;
7810                 if (cols1[jj] % bs != jj % bs) ok = 0;
7811               }
7812             }
7813             PetscCall(MatRestoreRow(c, brow + ii, &nc1, &cols1, NULL));
7814           }
7815           PetscCall(MatRestoreRow(c, brow, &nc2, &cols2, NULL));
7816           if (!ok) {
7817             PetscCall(PetscFree2(d_nnz, o_nnz));
7818             PetscCall(PetscInfo(Amat, "Found sparse blocks - revert to slow method\n"));
7819             goto old_bs;
7820           }
7821         }
7822       }
7823       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7824       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7825       PetscCall(PetscFree2(d_nnz, o_nnz));
7826       PetscCall(PetscMalloc2(nmax, &AA, nmax, &AJ));
7827       // diag
7828       for (PetscInt brow = 0, n, grow; brow < nloc * bs; brow += bs) { // block rows
7829         Mat_SeqAIJ *aseq = (Mat_SeqAIJ *)a->data;
7830 
7831         ai = aseq->i;
7832         n  = ai[brow + 1] - ai[brow];
7833         aj = aseq->j + ai[brow];
7834         for (PetscInt k = 0; k < n; k += bs) {   // block columns
7835           AJ[k / bs] = aj[k] / bs + Istart / bs; // diag starts at (Istart,Istart)
7836           val        = 0;
7837           if (index_size == 0) {
7838             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7839               aa = aseq->a + ai[brow + ii] + k;
7840               for (PetscInt jj = 0; jj < bs; jj++) {    // columns in block
7841                 val += PetscAbs(PetscRealPart(aa[jj])); // a sort of norm
7842               }
7843             }
7844           } else {                                            // use (index,index) value if provided
7845             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7846               PetscInt ii = index[iii];
7847               aa          = aseq->a + ai[brow + ii] + k;
7848               for (PetscInt jjj = 0; jjj < index_size; jjj++) { // columns in block
7849                 PetscInt jj = index[jjj];
7850                 val += PetscAbs(PetscRealPart(aa[jj]));
7851               }
7852             }
7853           }
7854           PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7855           AA[k / bs] = val;
7856         }
7857         grow = Istart / bs + brow / bs;
7858         PetscCall(MatSetValues(Gmat, 1, &grow, n / bs, AJ, AA, ADD_VALUES));
7859       }
7860       // off-diag
7861       if (ismpiaij) {
7862         Mat_MPIAIJ        *aij = (Mat_MPIAIJ *)Amat->data;
7863         const PetscScalar *vals;
7864         const PetscInt    *cols, *garray = aij->garray;
7865 
7866         PetscCheck(garray, PETSC_COMM_SELF, PETSC_ERR_USER, "No garray?");
7867         for (PetscInt brow = 0, grow; brow < nloc * bs; brow += bs) { // block rows
7868           PetscCall(MatGetRow(b, brow, &ncols, &cols, NULL));
7869           for (PetscInt k = 0, cidx = 0; k < ncols; k += bs, cidx++) {
7870             PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs >= nmax");
7871             AA[k / bs] = 0;
7872             AJ[cidx]   = garray[cols[k]] / bs;
7873           }
7874           nc = ncols / bs;
7875           PetscCall(MatRestoreRow(b, brow, &ncols, &cols, NULL));
7876           if (index_size == 0) {
7877             for (PetscInt ii = 0; ii < bs; ii++) { // rows in block
7878               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7879               for (PetscInt k = 0; k < ncols; k += bs) {
7880                 for (PetscInt jj = 0; jj < bs; jj++) { // cols in block
7881                   PetscAssert(k / bs < nmax, comm, PETSC_ERR_USER, "k / bs (%" PetscInt_FMT ") >= nmax (%" PetscInt_FMT ")", k / bs, nmax);
7882                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7883                 }
7884               }
7885               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7886             }
7887           } else {                                            // use (index,index) value if provided
7888             for (PetscInt iii = 0; iii < index_size; iii++) { // rows in block
7889               PetscInt ii = index[iii];
7890               PetscCall(MatGetRow(b, brow + ii, &ncols, &cols, &vals));
7891               for (PetscInt k = 0; k < ncols; k += bs) {
7892                 for (PetscInt jjj = 0; jjj < index_size; jjj++) { // cols in block
7893                   PetscInt jj = index[jjj];
7894                   AA[k / bs] += PetscAbs(PetscRealPart(vals[k + jj]));
7895                 }
7896               }
7897               PetscCall(MatRestoreRow(b, brow + ii, &ncols, &cols, &vals));
7898             }
7899           }
7900           grow = Istart / bs + brow / bs;
7901           PetscCall(MatSetValues(Gmat, 1, &grow, nc, AJ, AA, ADD_VALUES));
7902         }
7903       }
7904       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7905       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7906       PetscCall(PetscFree2(AA, AJ));
7907     } else {
7908       const PetscScalar *vals;
7909       const PetscInt    *idx;
7910       PetscInt          *d_nnz, *o_nnz, *w0, *w1, *w2;
7911     old_bs:
7912       /*
7913        Determine the preallocation needed for the scalar matrix derived from the vector matrix.
7914        */
7915       PetscCall(PetscInfo(Amat, "OLD bs>1 CreateGraph\n"));
7916       PetscCall(PetscMalloc2(nloc, &d_nnz, (isseqaij ? 0 : nloc), &o_nnz));
7917       if (isseqaij) {
7918         PetscInt max_d_nnz;
7919 
7920         /*
7921          Determine exact preallocation count for (sequential) scalar matrix
7922          */
7923         PetscCall(MatSeqAIJGetMaxRowNonzeros(Amat, &max_d_nnz));
7924         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7925         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7926         for (Ii = 0, jj = 0; Ii < Iend; Ii += bs, jj++) PetscCall(MatCollapseRows(Amat, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7927         PetscCall(PetscFree3(w0, w1, w2));
7928       } else if (ismpiaij) {
7929         Mat             Daij, Oaij;
7930         const PetscInt *garray;
7931         PetscInt        max_d_nnz;
7932 
7933         PetscCall(MatMPIAIJGetSeqAIJ(Amat, &Daij, &Oaij, &garray));
7934         /*
7935          Determine exact preallocation count for diagonal block portion of scalar matrix
7936          */
7937         PetscCall(MatSeqAIJGetMaxRowNonzeros(Daij, &max_d_nnz));
7938         max_d_nnz = PetscMin(nloc, bs * max_d_nnz);
7939         PetscCall(PetscMalloc3(max_d_nnz, &w0, max_d_nnz, &w1, max_d_nnz, &w2));
7940         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) PetscCall(MatCollapseRows(Daij, Ii, bs, w0, w1, w2, &d_nnz[jj], NULL));
7941         PetscCall(PetscFree3(w0, w1, w2));
7942         /*
7943          Overestimate (usually grossly so) the preallocation count for the off-diagonal portion of the scalar matrix
7944          */
7945         for (Ii = 0, jj = 0; Ii < Iend - Istart; Ii += bs, jj++) {
7946           o_nnz[jj] = 0;
7947           for (kk = 0; kk < bs; kk++) { /* rows that get collapsed to a single row */
7948             PetscCall(MatGetRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7949             o_nnz[jj] += ncols;
7950             PetscCall(MatRestoreRow(Oaij, Ii + kk, &ncols, NULL, NULL));
7951           }
7952           if (o_nnz[jj] > (NN / bs - nloc)) o_nnz[jj] = NN / bs - nloc;
7953         }
7954       } else SETERRQ(comm, PETSC_ERR_USER, "Require AIJ matrix type");
7955       /* get scalar copy (norms) of matrix */
7956       PetscCall(MatSeqAIJSetPreallocation(Gmat, 0, d_nnz));
7957       PetscCall(MatMPIAIJSetPreallocation(Gmat, 0, d_nnz, 0, o_nnz));
7958       PetscCall(PetscFree2(d_nnz, o_nnz));
7959       for (Ii = Istart; Ii < Iend; Ii++) {
7960         PetscInt dest_row = Ii / bs;
7961 
7962         PetscCall(MatGetRow(Amat, Ii, &ncols, &idx, &vals));
7963         for (jj = 0; jj < ncols; jj++) {
7964           PetscInt    dest_col = idx[jj] / bs;
7965           PetscScalar sv       = PetscAbs(PetscRealPart(vals[jj]));
7966 
7967           PetscCall(MatSetValues(Gmat, 1, &dest_row, 1, &dest_col, &sv, ADD_VALUES));
7968         }
7969         PetscCall(MatRestoreRow(Amat, Ii, &ncols, &idx, &vals));
7970       }
7971       PetscCall(MatAssemblyBegin(Gmat, MAT_FINAL_ASSEMBLY));
7972       PetscCall(MatAssemblyEnd(Gmat, MAT_FINAL_ASSEMBLY));
7973     }
7974   } else {
7975     if (symmetrize || filter >= 0 || scale) PetscCall(MatDuplicate(Amat, MAT_COPY_VALUES, &Gmat));
7976     else {
7977       Gmat = Amat;
7978       PetscCall(PetscObjectReference((PetscObject)Gmat));
7979     }
7980     if (isseqaij) {
7981       a = Gmat;
7982       b = NULL;
7983     } else {
7984       Mat_MPIAIJ *d = (Mat_MPIAIJ *)Gmat->data;
7985       a             = d->A;
7986       b             = d->B;
7987     }
7988     if (filter >= 0 || scale) {
7989       /* take absolute value of each entry */
7990       for (c = a, kk = 0; c && kk < 2; c = b, kk++) {
7991         MatInfo      info;
7992         PetscScalar *avals;
7993 
7994         PetscCall(MatGetInfo(c, MAT_LOCAL, &info));
7995         PetscCall(MatSeqAIJGetArray(c, &avals));
7996         for (int jj = 0; jj < info.nz_used; jj++) avals[jj] = PetscAbsScalar(avals[jj]);
7997         PetscCall(MatSeqAIJRestoreArray(c, &avals));
7998       }
7999     }
8000   }
8001   if (symmetrize) {
8002     PetscBool isset, issym;
8003 
8004     PetscCall(MatIsSymmetricKnown(Amat, &isset, &issym));
8005     if (!isset || !issym) {
8006       Mat matTrans;
8007 
8008       PetscCall(MatTranspose(Gmat, MAT_INITIAL_MATRIX, &matTrans));
8009       PetscCall(MatAXPY(Gmat, 1.0, matTrans, Gmat->structurally_symmetric == PETSC_BOOL3_TRUE ? SAME_NONZERO_PATTERN : DIFFERENT_NONZERO_PATTERN));
8010       PetscCall(MatDestroy(&matTrans));
8011     }
8012     PetscCall(MatSetOption(Gmat, MAT_SYMMETRIC, PETSC_TRUE));
8013   } else if (Amat != Gmat) PetscCall(MatPropagateSymmetryOptions(Amat, Gmat));
8014   if (scale) {
8015     /* symmetrically scale Gmat so that all diagonal values become +1 or -1 */
8016     Vec diag;
8017 
8018     PetscCall(MatCreateVecs(Gmat, &diag, NULL));
8019     PetscCall(MatGetDiagonal(Gmat, diag));
8020     PetscCall(VecReciprocal(diag));
8021     PetscCall(VecSqrtAbs(diag));
8022     PetscCall(MatDiagonalScale(Gmat, diag, diag));
8023     PetscCall(VecDestroy(&diag));
8024   }
8025   PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_graph_view"));
8026   if (filter >= 0) {
8027     PetscCall(MatFilter(Gmat, filter, PETSC_TRUE, PETSC_TRUE));
8028     PetscCall(MatViewFromOptions(Gmat, NULL, "-mat_filter_graph_view"));
8029   }
8030   *a_Gmat = Gmat;
8031   PetscFunctionReturn(PETSC_SUCCESS);
8032 }
8033 
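/*
   Sketch of how this routine is reached through the public MatCreateGraph() interface (as done by,
   e.g., PCGAMG); a negative filter keeps all entries and index_size = 0 uses every row/column of each
   block:

     Mat G;
     PetscCall(MatCreateGraph(Amat, PETSC_TRUE, PETSC_TRUE, -1.0, 0, NULL, &G)); // symmetrize and scale
     PetscCall(MatDestroy(&G));
*/
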
8034 /*
8035     Special version for direct calls from Fortran
8036 */
8037 
8038 /* Change these macros so they can be used in a void function */
8039 /* Identical to PetscCallVoid, except it assigns to *_ierr */
8040 #undef PetscCall
8041 #define PetscCall(...) \
8042   do { \
8043     PetscErrorCode ierr_msv_mpiaij = __VA_ARGS__; \
8044     if (PetscUnlikely(ierr_msv_mpiaij)) { \
8045       *_ierr = PetscError(PETSC_COMM_SELF, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr_msv_mpiaij, PETSC_ERROR_REPEAT, " "); \
8046       return; \
8047     } \
8048   } while (0)
8049 
8050 #undef SETERRQ
8051 #define SETERRQ(comm, ierr, ...) \
8052   do { \
8053     *_ierr = PetscError(comm, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr, PETSC_ERROR_INITIAL, __VA_ARGS__); \
8054     return; \
8055   } while (0)
8056 
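/* With the redefinitions above, a failing PetscCall() inside the Fortran stub below reports the error
   through the *_ierr output argument and returns void; for example, PetscCall(MatSeqAIJGetArray(A, &aa))
   expands roughly to

     ierr_msv_mpiaij = MatSeqAIJGetArray(A, &aa);
     if (PetscUnlikely(ierr_msv_mpiaij)) {
       *_ierr = PetscError(PETSC_COMM_SELF, __LINE__, PETSC_FUNCTION_NAME, __FILE__, ierr_msv_mpiaij, PETSC_ERROR_REPEAT, " ");
       return;
     }
*/
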
8057 #if defined(PETSC_HAVE_FORTRAN_CAPS)
8058   #define matsetvaluesmpiaij_ MATSETVALUESMPIAIJ
8059 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
8060   #define matsetvaluesmpiaij_ matsetvaluesmpiaij
8061 #else
8062 #endif
8063 PETSC_EXTERN void matsetvaluesmpiaij_(Mat *mmat, PetscInt *mm, const PetscInt im[], PetscInt *mn, const PetscInt in[], const PetscScalar v[], InsertMode *maddv, PetscErrorCode *_ierr)
8064 {
8065   Mat         mat = *mmat;
8066   PetscInt    m = *mm, n = *mn;
8067   InsertMode  addv = *maddv;
8068   Mat_MPIAIJ *aij  = (Mat_MPIAIJ *)mat->data;
8069   PetscScalar value;
8070 
8071   MatCheckPreallocated(mat, 1);
8072   if (mat->insertmode == NOT_SET_VALUES) mat->insertmode = addv;
8073   else PetscCheck(mat->insertmode == addv, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Cannot mix add values and insert values");
8074   {
8075     PetscInt  i, j, rstart = mat->rmap->rstart, rend = mat->rmap->rend;
8076     PetscInt  cstart = mat->cmap->rstart, cend = mat->cmap->rend, row, col;
8077     PetscBool roworiented = aij->roworiented;
8078 
8079     /* Some Variables required in the macro */
8080     Mat         A     = aij->A;
8081     Mat_SeqAIJ *a     = (Mat_SeqAIJ *)A->data;
8082     PetscInt   *aimax = a->imax, *ai = a->i, *ailen = a->ilen, *aj = a->j;
8083     MatScalar  *aa;
8084     PetscBool   ignorezeroentries = ((a->ignorezeroentries && (addv == ADD_VALUES)) ? PETSC_TRUE : PETSC_FALSE);
8085     Mat         B                 = aij->B;
8086     Mat_SeqAIJ *b                 = (Mat_SeqAIJ *)B->data;
8087     PetscInt   *bimax = b->imax, *bi = b->i, *bilen = b->ilen, *bj = b->j, bm = aij->B->rmap->n, am = aij->A->rmap->n;
8088     MatScalar  *ba;
8089     /* The variable below is only used in the PETSC_HAVE_VIENNACL or PETSC_HAVE_CUDA cases, but we define it in all cases because we
8090      * cannot use "#if defined" inside a macro. */
8091     PETSC_UNUSED PetscBool inserted = PETSC_FALSE;
8092 
8093     PetscInt  *rp1, *rp2, ii, nrow1, nrow2, _i, rmax1, rmax2, N, low1, high1, low2, high2, t, lastcol1, lastcol2;
8094     PetscInt   nonew = a->nonew;
8095     MatScalar *ap1, *ap2;
8096 
8097     PetscFunctionBegin;
8098     PetscCall(MatSeqAIJGetArray(A, &aa));
8099     PetscCall(MatSeqAIJGetArray(B, &ba));
8100     for (i = 0; i < m; i++) {
8101       if (im[i] < 0) continue;
8102       PetscCheck(im[i] < mat->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, im[i], mat->rmap->N - 1);
8103       if (im[i] >= rstart && im[i] < rend) {
8104         row      = im[i] - rstart;
8105         lastcol1 = -1;
8106         rp1      = aj + ai[row];
8107         ap1      = aa + ai[row];
8108         rmax1    = aimax[row];
8109         nrow1    = ailen[row];
8110         low1     = 0;
8111         high1    = nrow1;
8112         lastcol2 = -1;
8113         rp2      = bj + bi[row];
8114         ap2      = ba + bi[row];
8115         rmax2    = bimax[row];
8116         nrow2    = bilen[row];
8117         low2     = 0;
8118         high2    = nrow2;
8119 
8120         for (j = 0; j < n; j++) {
8121           if (roworiented) value = v[i * n + j];
8122           else value = v[i + j * m];
8123           if (ignorezeroentries && value == 0.0 && (addv == ADD_VALUES) && im[i] != in[j]) continue;
8124           if (in[j] >= cstart && in[j] < cend) {
8125             col = in[j] - cstart;
8126             MatSetValues_SeqAIJ_A_Private(row, col, value, addv, im[i], in[j]);
8127           } else if (in[j] < 0) continue;
8128           else if (PetscUnlikelyDebug(in[j] >= mat->cmap->N)) {
8129             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[j], mat->cmap->N - 1);
8130           } else {
8131             if (mat->was_assembled) {
8132               if (!aij->colmap) PetscCall(MatCreateColmap_MPIAIJ_Private(mat));
8133 #if defined(PETSC_USE_CTABLE)
8134               PetscCall(PetscHMapIGetWithDefault(aij->colmap, in[j] + 1, 0, &col));
8135               col--;
8136 #else
8137               col = aij->colmap[in[j]] - 1;
8138 #endif
8139               if (col < 0 && !((Mat_SeqAIJ *)aij->A->data)->nonew) {
8140                 PetscCall(MatDisAssemble_MPIAIJ(mat, PETSC_FALSE));
8141                 col = in[j];
8142                 /* Reinitialize the variables required by MatSetValues_SeqAIJ_B_Private() */
8143                 B        = aij->B;
8144                 b        = (Mat_SeqAIJ *)B->data;
8145                 bimax    = b->imax;
8146                 bi       = b->i;
8147                 bilen    = b->ilen;
8148                 bj       = b->j;
8149                 rp2      = bj + bi[row];
8150                 ap2      = ba + bi[row];
8151                 rmax2    = bimax[row];
8152                 nrow2    = bilen[row];
8153                 low2     = 0;
8154                 high2    = nrow2;
8155                 bm       = aij->B->rmap->n;
8156                 ba       = b->a;
8157                 inserted = PETSC_FALSE;
8158               }
8159             } else col = in[j];
8160             MatSetValues_SeqAIJ_B_Private(row, col, value, addv, im[i], in[j]);
8161           }
8162         }
8163       } else if (!aij->donotstash) {
8164         if (roworiented) {
8165           PetscCall(MatStashValuesRow_Private(&mat->stash, im[i], n, in, v + i * n, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8166         } else {
8167           PetscCall(MatStashValuesCol_Private(&mat->stash, im[i], n, in, v + i, m, (PetscBool)(ignorezeroentries && (addv == ADD_VALUES))));
8168         }
8169       }
8170     }
8171     PetscCall(MatSeqAIJRestoreArray(A, &aa));
8172     PetscCall(MatSeqAIJRestoreArray(B, &ba));
8173   }
8174   PetscFunctionReturnVoid();
8175 }
8176 
8177 /* Undefining these here since they were redefined from their original definition above! No
8178  * other PETSc functions should be defined past this point, as it is impossible to recover the
8179  * original definitions */
8180 #undef PetscCall
8181 #undef SETERRQ
8182