xref: /petsc/src/mat/impls/baij/seq/baij.c (revision d5b43468fb8780a8feea140ccd6fa3e6a50411cc)
1 
2 /*
3     Defines the basic matrix operations for the BAIJ (compressed row)
4   matrix storage format.
5 */
6 #include <../src/mat/impls/baij/seq/baij.h> /*I   "petscmat.h"  I*/
7 #include <petscblaslapack.h>
8 #include <petsc/private/kernels/blockinvert.h>
9 #include <petsc/private/kernels/blockmatmult.h>
10 
11 #if defined(PETSC_HAVE_HYPRE)
12 PETSC_INTERN PetscErrorCode MatConvert_AIJ_HYPRE(Mat, MatType, MatReuse, Mat *);
13 #endif
14 
15 #if defined(PETSC_HAVE_MKL_SPARSE_OPTIMIZE)
16 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBAIJMKL(Mat, MatType, MatReuse, Mat *);
17 #endif
18 PETSC_INTERN PetscErrorCode MatConvert_XAIJ_IS(Mat, MatType, MatReuse, Mat *);
19 
20 PetscErrorCode MatGetColumnReductions_SeqBAIJ(Mat A, PetscInt type, PetscReal *reductions)
21 {
22   Mat_SeqBAIJ *a_aij = (Mat_SeqBAIJ *)A->data;
23   PetscInt     m, n, i;
24   PetscInt     ib, jb, bs = A->rmap->bs;
25   MatScalar   *a_val = a_aij->a;
26 
27   PetscFunctionBegin;
28   PetscCall(MatGetSize(A, &m, &n));
29   for (i = 0; i < n; i++) reductions[i] = 0.0;
30   if (type == NORM_2) {
31     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
32       for (jb = 0; jb < bs; jb++) {
33         for (ib = 0; ib < bs; ib++) {
34           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val * *a_val);
35           a_val++;
36         }
37       }
38     }
39   } else if (type == NORM_1) {
40     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
41       for (jb = 0; jb < bs; jb++) {
42         for (ib = 0; ib < bs; ib++) {
43           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscAbsScalar(*a_val);
44           a_val++;
45         }
46       }
47     }
48   } else if (type == NORM_INFINITY) {
49     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
50       for (jb = 0; jb < bs; jb++) {
51         for (ib = 0; ib < bs; ib++) {
52           int col         = A->cmap->rstart + a_aij->j[i] * bs + jb;
53           reductions[col] = PetscMax(PetscAbsScalar(*a_val), reductions[col]);
54           a_val++;
55         }
56       }
57     }
58   } else if (type == REDUCTION_SUM_REALPART || type == REDUCTION_MEAN_REALPART) {
59     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
60       for (jb = 0; jb < bs; jb++) {
61         for (ib = 0; ib < bs; ib++) {
62           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscRealPart(*a_val);
63           a_val++;
64         }
65       }
66     }
67   } else if (type == REDUCTION_SUM_IMAGINARYPART || type == REDUCTION_MEAN_IMAGINARYPART) {
68     for (i = a_aij->i[0]; i < a_aij->i[A->rmap->n / bs]; i++) {
69       for (jb = 0; jb < bs; jb++) {
70         for (ib = 0; ib < bs; ib++) {
71           reductions[A->cmap->rstart + a_aij->j[i] * bs + jb] += PetscImaginaryPart(*a_val);
72           a_val++;
73         }
74       }
75     }
76   } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Unknown reduction type");
77   if (type == NORM_2) {
78     for (i = 0; i < n; i++) reductions[i] = PetscSqrtReal(reductions[i]);
79   } else if (type == REDUCTION_MEAN_REALPART || type == REDUCTION_MEAN_IMAGINARYPART) {
80     for (i = 0; i < n; i++) reductions[i] /= m;
81   }
82   PetscFunctionReturn(0);
83 }
84 
85 PetscErrorCode MatInvertBlockDiagonal_SeqBAIJ(Mat A, const PetscScalar **values)
86 {
87   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
88   PetscInt    *diag_offset, i, bs = A->rmap->bs, mbs = a->mbs, ipvt[5], bs2 = bs * bs, *v_pivots;
89   MatScalar   *v     = a->a, *odiag, *diag, work[25], *v_work;
90   PetscReal    shift = 0.0;
91   PetscBool    allowzeropivot, zeropivotdetected = PETSC_FALSE;
92 
93   PetscFunctionBegin;
94   allowzeropivot = PetscNot(A->erroriffailure);
95 
96   if (a->idiagvalid) {
97     if (values) *values = a->idiag;
98     PetscFunctionReturn(0);
99   }
100   PetscCall(MatMarkDiagonal_SeqBAIJ(A));
101   diag_offset = a->diag;
102   if (!a->idiag) { PetscCall(PetscMalloc1(bs2 * mbs, &a->idiag)); }
103   diag = a->idiag;
104   if (values) *values = a->idiag;
105   /* factor and invert each block */
106   switch (bs) {
107   case 1:
108     for (i = 0; i < mbs; i++) {
109       odiag   = v + 1 * diag_offset[i];
110       diag[0] = odiag[0];
111 
112       if (PetscAbsScalar(diag[0] + shift) < PETSC_MACHINE_EPSILON) {
113         if (allowzeropivot) {
114           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
115           A->factorerror_zeropivot_value = PetscAbsScalar(diag[0]);
116           A->factorerror_zeropivot_row   = i;
117           PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", i));
118         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot, row %" PetscInt_FMT " pivot value %g tolerance %g", i, (double)PetscAbsScalar(diag[0]), (double)PETSC_MACHINE_EPSILON);
119       }
120 
121       diag[0] = (PetscScalar)1.0 / (diag[0] + shift);
122       diag += 1;
123     }
124     break;
125   case 2:
126     for (i = 0; i < mbs; i++) {
127       odiag   = v + 4 * diag_offset[i];
128       diag[0] = odiag[0];
129       diag[1] = odiag[1];
130       diag[2] = odiag[2];
131       diag[3] = odiag[3];
132       PetscCall(PetscKernel_A_gets_inverse_A_2(diag, shift, allowzeropivot, &zeropivotdetected));
133       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
134       diag += 4;
135     }
136     break;
137   case 3:
138     for (i = 0; i < mbs; i++) {
139       odiag   = v + 9 * diag_offset[i];
140       diag[0] = odiag[0];
141       diag[1] = odiag[1];
142       diag[2] = odiag[2];
143       diag[3] = odiag[3];
144       diag[4] = odiag[4];
145       diag[5] = odiag[5];
146       diag[6] = odiag[6];
147       diag[7] = odiag[7];
148       diag[8] = odiag[8];
149       PetscCall(PetscKernel_A_gets_inverse_A_3(diag, shift, allowzeropivot, &zeropivotdetected));
150       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
151       diag += 9;
152     }
153     break;
154   case 4:
155     for (i = 0; i < mbs; i++) {
156       odiag = v + 16 * diag_offset[i];
157       PetscCall(PetscArraycpy(diag, odiag, 16));
158       PetscCall(PetscKernel_A_gets_inverse_A_4(diag, shift, allowzeropivot, &zeropivotdetected));
159       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
160       diag += 16;
161     }
162     break;
163   case 5:
164     for (i = 0; i < mbs; i++) {
165       odiag = v + 25 * diag_offset[i];
166       PetscCall(PetscArraycpy(diag, odiag, 25));
167       PetscCall(PetscKernel_A_gets_inverse_A_5(diag, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
168       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
169       diag += 25;
170     }
171     break;
172   case 6:
173     for (i = 0; i < mbs; i++) {
174       odiag = v + 36 * diag_offset[i];
175       PetscCall(PetscArraycpy(diag, odiag, 36));
176       PetscCall(PetscKernel_A_gets_inverse_A_6(diag, shift, allowzeropivot, &zeropivotdetected));
177       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
178       diag += 36;
179     }
180     break;
181   case 7:
182     for (i = 0; i < mbs; i++) {
183       odiag = v + 49 * diag_offset[i];
184       PetscCall(PetscArraycpy(diag, odiag, 49));
185       PetscCall(PetscKernel_A_gets_inverse_A_7(diag, shift, allowzeropivot, &zeropivotdetected));
186       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
187       diag += 49;
188     }
189     break;
190   default:
191     PetscCall(PetscMalloc2(bs, &v_work, bs, &v_pivots));
192     for (i = 0; i < mbs; i++) {
193       odiag = v + bs2 * diag_offset[i];
194       PetscCall(PetscArraycpy(diag, odiag, bs2));
195       PetscCall(PetscKernel_A_gets_inverse_A(bs, diag, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
196       if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
197       diag += bs2;
198     }
199     PetscCall(PetscFree2(v_work, v_pivots));
200   }
201   a->idiagvalid = PETSC_TRUE;
202   PetscFunctionReturn(0);
203 }
204 
205 PetscErrorCode MatSOR_SeqBAIJ(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
206 {
207   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
208   PetscScalar       *x, *work, *w, *workt, *t;
209   const MatScalar   *v, *aa = a->a, *idiag;
210   const PetscScalar *b, *xb;
211   PetscScalar        s[7], xw[7] = {0}; /* avoid some compilers thinking xw is uninitialized */
212   PetscInt           m = a->mbs, i, i2, nz, bs = A->rmap->bs, bs2 = bs * bs, k, j, idx, it;
213   const PetscInt    *diag, *ai = a->i, *aj = a->j, *vi;
214 
215   PetscFunctionBegin;
216   its = its * lits;
217   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
218   PetscCheck(its > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Relaxation requires global its %" PetscInt_FMT " and local its %" PetscInt_FMT " both positive", its, lits);
219   PetscCheck(!fshift, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for diagonal shift");
220   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for non-trivial relaxation factor");
221   PetscCheck(!(flag & SOR_APPLY_UPPER) && !(flag & SOR_APPLY_LOWER), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for applying upper or lower triangular parts");
222 
223   if (!a->idiagvalid) PetscCall(MatInvertBlockDiagonal(A, NULL));
224 
225   if (!m) PetscFunctionReturn(0);
226   diag  = a->diag;
227   idiag = a->idiag;
228   k     = PetscMax(A->rmap->n, A->cmap->n);
229   if (!a->mult_work) PetscCall(PetscMalloc1(k + 1, &a->mult_work));
230   if (!a->sor_workt) PetscCall(PetscMalloc1(k, &a->sor_workt));
231   if (!a->sor_work) PetscCall(PetscMalloc1(bs, &a->sor_work));
232   work = a->mult_work;
233   t    = a->sor_workt;
234   w    = a->sor_work;
235 
236   PetscCall(VecGetArray(xx, &x));
237   PetscCall(VecGetArrayRead(bb, &b));
238 
239   if (flag & SOR_ZERO_INITIAL_GUESS) {
240     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
241       switch (bs) {
242       case 1:
243         PetscKernel_v_gets_A_times_w_1(x, idiag, b);
244         t[0] = b[0];
245         i2   = 1;
246         idiag += 1;
247         for (i = 1; i < m; i++) {
248           v    = aa + ai[i];
249           vi   = aj + ai[i];
250           nz   = diag[i] - ai[i];
251           s[0] = b[i2];
252           for (j = 0; j < nz; j++) {
253             xw[0] = x[vi[j]];
254             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
255           }
256           t[i2] = s[0];
257           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
258           x[i2] = xw[0];
259           idiag += 1;
260           i2 += 1;
261         }
262         break;
263       case 2:
264         PetscKernel_v_gets_A_times_w_2(x, idiag, b);
265         t[0] = b[0];
266         t[1] = b[1];
267         i2   = 2;
268         idiag += 4;
269         for (i = 1; i < m; i++) {
270           v    = aa + 4 * ai[i];
271           vi   = aj + ai[i];
272           nz   = diag[i] - ai[i];
273           s[0] = b[i2];
274           s[1] = b[i2 + 1];
275           for (j = 0; j < nz; j++) {
276             idx   = 2 * vi[j];
277             it    = 4 * j;
278             xw[0] = x[idx];
279             xw[1] = x[1 + idx];
280             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
281           }
282           t[i2]     = s[0];
283           t[i2 + 1] = s[1];
284           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
285           x[i2]     = xw[0];
286           x[i2 + 1] = xw[1];
287           idiag += 4;
288           i2 += 2;
289         }
290         break;
291       case 3:
292         PetscKernel_v_gets_A_times_w_3(x, idiag, b);
293         t[0] = b[0];
294         t[1] = b[1];
295         t[2] = b[2];
296         i2   = 3;
297         idiag += 9;
298         for (i = 1; i < m; i++) {
299           v    = aa + 9 * ai[i];
300           vi   = aj + ai[i];
301           nz   = diag[i] - ai[i];
302           s[0] = b[i2];
303           s[1] = b[i2 + 1];
304           s[2] = b[i2 + 2];
305           while (nz--) {
306             idx   = 3 * (*vi++);
307             xw[0] = x[idx];
308             xw[1] = x[1 + idx];
309             xw[2] = x[2 + idx];
310             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
311             v += 9;
312           }
313           t[i2]     = s[0];
314           t[i2 + 1] = s[1];
315           t[i2 + 2] = s[2];
316           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
317           x[i2]     = xw[0];
318           x[i2 + 1] = xw[1];
319           x[i2 + 2] = xw[2];
320           idiag += 9;
321           i2 += 3;
322         }
323         break;
324       case 4:
325         PetscKernel_v_gets_A_times_w_4(x, idiag, b);
326         t[0] = b[0];
327         t[1] = b[1];
328         t[2] = b[2];
329         t[3] = b[3];
330         i2   = 4;
331         idiag += 16;
332         for (i = 1; i < m; i++) {
333           v    = aa + 16 * ai[i];
334           vi   = aj + ai[i];
335           nz   = diag[i] - ai[i];
336           s[0] = b[i2];
337           s[1] = b[i2 + 1];
338           s[2] = b[i2 + 2];
339           s[3] = b[i2 + 3];
340           while (nz--) {
341             idx   = 4 * (*vi++);
342             xw[0] = x[idx];
343             xw[1] = x[1 + idx];
344             xw[2] = x[2 + idx];
345             xw[3] = x[3 + idx];
346             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
347             v += 16;
348           }
349           t[i2]     = s[0];
350           t[i2 + 1] = s[1];
351           t[i2 + 2] = s[2];
352           t[i2 + 3] = s[3];
353           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
354           x[i2]     = xw[0];
355           x[i2 + 1] = xw[1];
356           x[i2 + 2] = xw[2];
357           x[i2 + 3] = xw[3];
358           idiag += 16;
359           i2 += 4;
360         }
361         break;
362       case 5:
363         PetscKernel_v_gets_A_times_w_5(x, idiag, b);
364         t[0] = b[0];
365         t[1] = b[1];
366         t[2] = b[2];
367         t[3] = b[3];
368         t[4] = b[4];
369         i2   = 5;
370         idiag += 25;
371         for (i = 1; i < m; i++) {
372           v    = aa + 25 * ai[i];
373           vi   = aj + ai[i];
374           nz   = diag[i] - ai[i];
375           s[0] = b[i2];
376           s[1] = b[i2 + 1];
377           s[2] = b[i2 + 2];
378           s[3] = b[i2 + 3];
379           s[4] = b[i2 + 4];
380           while (nz--) {
381             idx   = 5 * (*vi++);
382             xw[0] = x[idx];
383             xw[1] = x[1 + idx];
384             xw[2] = x[2 + idx];
385             xw[3] = x[3 + idx];
386             xw[4] = x[4 + idx];
387             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
388             v += 25;
389           }
390           t[i2]     = s[0];
391           t[i2 + 1] = s[1];
392           t[i2 + 2] = s[2];
393           t[i2 + 3] = s[3];
394           t[i2 + 4] = s[4];
395           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
396           x[i2]     = xw[0];
397           x[i2 + 1] = xw[1];
398           x[i2 + 2] = xw[2];
399           x[i2 + 3] = xw[3];
400           x[i2 + 4] = xw[4];
401           idiag += 25;
402           i2 += 5;
403         }
404         break;
405       case 6:
406         PetscKernel_v_gets_A_times_w_6(x, idiag, b);
407         t[0] = b[0];
408         t[1] = b[1];
409         t[2] = b[2];
410         t[3] = b[3];
411         t[4] = b[4];
412         t[5] = b[5];
413         i2   = 6;
414         idiag += 36;
415         for (i = 1; i < m; i++) {
416           v    = aa + 36 * ai[i];
417           vi   = aj + ai[i];
418           nz   = diag[i] - ai[i];
419           s[0] = b[i2];
420           s[1] = b[i2 + 1];
421           s[2] = b[i2 + 2];
422           s[3] = b[i2 + 3];
423           s[4] = b[i2 + 4];
424           s[5] = b[i2 + 5];
425           while (nz--) {
426             idx   = 6 * (*vi++);
427             xw[0] = x[idx];
428             xw[1] = x[1 + idx];
429             xw[2] = x[2 + idx];
430             xw[3] = x[3 + idx];
431             xw[4] = x[4 + idx];
432             xw[5] = x[5 + idx];
433             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
434             v += 36;
435           }
436           t[i2]     = s[0];
437           t[i2 + 1] = s[1];
438           t[i2 + 2] = s[2];
439           t[i2 + 3] = s[3];
440           t[i2 + 4] = s[4];
441           t[i2 + 5] = s[5];
442           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
443           x[i2]     = xw[0];
444           x[i2 + 1] = xw[1];
445           x[i2 + 2] = xw[2];
446           x[i2 + 3] = xw[3];
447           x[i2 + 4] = xw[4];
448           x[i2 + 5] = xw[5];
449           idiag += 36;
450           i2 += 6;
451         }
452         break;
453       case 7:
454         PetscKernel_v_gets_A_times_w_7(x, idiag, b);
455         t[0] = b[0];
456         t[1] = b[1];
457         t[2] = b[2];
458         t[3] = b[3];
459         t[4] = b[4];
460         t[5] = b[5];
461         t[6] = b[6];
462         i2   = 7;
463         idiag += 49;
464         for (i = 1; i < m; i++) {
465           v    = aa + 49 * ai[i];
466           vi   = aj + ai[i];
467           nz   = diag[i] - ai[i];
468           s[0] = b[i2];
469           s[1] = b[i2 + 1];
470           s[2] = b[i2 + 2];
471           s[3] = b[i2 + 3];
472           s[4] = b[i2 + 4];
473           s[5] = b[i2 + 5];
474           s[6] = b[i2 + 6];
475           while (nz--) {
476             idx   = 7 * (*vi++);
477             xw[0] = x[idx];
478             xw[1] = x[1 + idx];
479             xw[2] = x[2 + idx];
480             xw[3] = x[3 + idx];
481             xw[4] = x[4 + idx];
482             xw[5] = x[5 + idx];
483             xw[6] = x[6 + idx];
484             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
485             v += 49;
486           }
487           t[i2]     = s[0];
488           t[i2 + 1] = s[1];
489           t[i2 + 2] = s[2];
490           t[i2 + 3] = s[3];
491           t[i2 + 4] = s[4];
492           t[i2 + 5] = s[5];
493           t[i2 + 6] = s[6];
494           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
495           x[i2]     = xw[0];
496           x[i2 + 1] = xw[1];
497           x[i2 + 2] = xw[2];
498           x[i2 + 3] = xw[3];
499           x[i2 + 4] = xw[4];
500           x[i2 + 5] = xw[5];
501           x[i2 + 6] = xw[6];
502           idiag += 49;
503           i2 += 7;
504         }
505         break;
506       default:
507         PetscKernel_w_gets_Ar_times_v(bs, bs, b, idiag, x);
508         PetscCall(PetscArraycpy(t, b, bs));
509         i2 = bs;
510         idiag += bs2;
511         for (i = 1; i < m; i++) {
512           v  = aa + bs2 * ai[i];
513           vi = aj + ai[i];
514           nz = diag[i] - ai[i];
515 
516           PetscCall(PetscArraycpy(w, b + i2, bs));
517           /* copy all rows of x that are needed into contiguous space */
518           workt = work;
519           for (j = 0; j < nz; j++) {
520             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
521             workt += bs;
522           }
523           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
524           PetscCall(PetscArraycpy(t + i2, w, bs));
525           PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
526 
527           idiag += bs2;
528           i2 += bs;
529         }
530         break;
531       }
532       /* for logging purposes assume number of nonzero in lower half is 1/2 of total */
533       PetscCall(PetscLogFlops(1.0 * bs2 * a->nz));
534       xb = t;
535     } else xb = b;
536     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
537       idiag = a->idiag + bs2 * (a->mbs - 1);
538       i2    = bs * (m - 1);
539       switch (bs) {
540       case 1:
541         s[0] = xb[i2];
542         PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
543         x[i2] = xw[0];
544         i2 -= 1;
545         for (i = m - 2; i >= 0; i--) {
546           v    = aa + (diag[i] + 1);
547           vi   = aj + diag[i] + 1;
548           nz   = ai[i + 1] - diag[i] - 1;
549           s[0] = xb[i2];
550           for (j = 0; j < nz; j++) {
551             xw[0] = x[vi[j]];
552             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
553           }
554           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
555           x[i2] = xw[0];
556           idiag -= 1;
557           i2 -= 1;
558         }
559         break;
560       case 2:
561         s[0] = xb[i2];
562         s[1] = xb[i2 + 1];
563         PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
564         x[i2]     = xw[0];
565         x[i2 + 1] = xw[1];
566         i2 -= 2;
567         idiag -= 4;
568         for (i = m - 2; i >= 0; i--) {
569           v    = aa + 4 * (diag[i] + 1);
570           vi   = aj + diag[i] + 1;
571           nz   = ai[i + 1] - diag[i] - 1;
572           s[0] = xb[i2];
573           s[1] = xb[i2 + 1];
574           for (j = 0; j < nz; j++) {
575             idx   = 2 * vi[j];
576             it    = 4 * j;
577             xw[0] = x[idx];
578             xw[1] = x[1 + idx];
579             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
580           }
581           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
582           x[i2]     = xw[0];
583           x[i2 + 1] = xw[1];
584           idiag -= 4;
585           i2 -= 2;
586         }
587         break;
588       case 3:
589         s[0] = xb[i2];
590         s[1] = xb[i2 + 1];
591         s[2] = xb[i2 + 2];
592         PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
593         x[i2]     = xw[0];
594         x[i2 + 1] = xw[1];
595         x[i2 + 2] = xw[2];
596         i2 -= 3;
597         idiag -= 9;
598         for (i = m - 2; i >= 0; i--) {
599           v    = aa + 9 * (diag[i] + 1);
600           vi   = aj + diag[i] + 1;
601           nz   = ai[i + 1] - diag[i] - 1;
602           s[0] = xb[i2];
603           s[1] = xb[i2 + 1];
604           s[2] = xb[i2 + 2];
605           while (nz--) {
606             idx   = 3 * (*vi++);
607             xw[0] = x[idx];
608             xw[1] = x[1 + idx];
609             xw[2] = x[2 + idx];
610             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
611             v += 9;
612           }
613           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
614           x[i2]     = xw[0];
615           x[i2 + 1] = xw[1];
616           x[i2 + 2] = xw[2];
617           idiag -= 9;
618           i2 -= 3;
619         }
620         break;
621       case 4:
622         s[0] = xb[i2];
623         s[1] = xb[i2 + 1];
624         s[2] = xb[i2 + 2];
625         s[3] = xb[i2 + 3];
626         PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
627         x[i2]     = xw[0];
628         x[i2 + 1] = xw[1];
629         x[i2 + 2] = xw[2];
630         x[i2 + 3] = xw[3];
631         i2 -= 4;
632         idiag -= 16;
633         for (i = m - 2; i >= 0; i--) {
634           v    = aa + 16 * (diag[i] + 1);
635           vi   = aj + diag[i] + 1;
636           nz   = ai[i + 1] - diag[i] - 1;
637           s[0] = xb[i2];
638           s[1] = xb[i2 + 1];
639           s[2] = xb[i2 + 2];
640           s[3] = xb[i2 + 3];
641           while (nz--) {
642             idx   = 4 * (*vi++);
643             xw[0] = x[idx];
644             xw[1] = x[1 + idx];
645             xw[2] = x[2 + idx];
646             xw[3] = x[3 + idx];
647             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
648             v += 16;
649           }
650           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
651           x[i2]     = xw[0];
652           x[i2 + 1] = xw[1];
653           x[i2 + 2] = xw[2];
654           x[i2 + 3] = xw[3];
655           idiag -= 16;
656           i2 -= 4;
657         }
658         break;
659       case 5:
660         s[0] = xb[i2];
661         s[1] = xb[i2 + 1];
662         s[2] = xb[i2 + 2];
663         s[3] = xb[i2 + 3];
664         s[4] = xb[i2 + 4];
665         PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
666         x[i2]     = xw[0];
667         x[i2 + 1] = xw[1];
668         x[i2 + 2] = xw[2];
669         x[i2 + 3] = xw[3];
670         x[i2 + 4] = xw[4];
671         i2 -= 5;
672         idiag -= 25;
673         for (i = m - 2; i >= 0; i--) {
674           v    = aa + 25 * (diag[i] + 1);
675           vi   = aj + diag[i] + 1;
676           nz   = ai[i + 1] - diag[i] - 1;
677           s[0] = xb[i2];
678           s[1] = xb[i2 + 1];
679           s[2] = xb[i2 + 2];
680           s[3] = xb[i2 + 3];
681           s[4] = xb[i2 + 4];
682           while (nz--) {
683             idx   = 5 * (*vi++);
684             xw[0] = x[idx];
685             xw[1] = x[1 + idx];
686             xw[2] = x[2 + idx];
687             xw[3] = x[3 + idx];
688             xw[4] = x[4 + idx];
689             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
690             v += 25;
691           }
692           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
693           x[i2]     = xw[0];
694           x[i2 + 1] = xw[1];
695           x[i2 + 2] = xw[2];
696           x[i2 + 3] = xw[3];
697           x[i2 + 4] = xw[4];
698           idiag -= 25;
699           i2 -= 5;
700         }
701         break;
702       case 6:
703         s[0] = xb[i2];
704         s[1] = xb[i2 + 1];
705         s[2] = xb[i2 + 2];
706         s[3] = xb[i2 + 3];
707         s[4] = xb[i2 + 4];
708         s[5] = xb[i2 + 5];
709         PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
710         x[i2]     = xw[0];
711         x[i2 + 1] = xw[1];
712         x[i2 + 2] = xw[2];
713         x[i2 + 3] = xw[3];
714         x[i2 + 4] = xw[4];
715         x[i2 + 5] = xw[5];
716         i2 -= 6;
717         idiag -= 36;
718         for (i = m - 2; i >= 0; i--) {
719           v    = aa + 36 * (diag[i] + 1);
720           vi   = aj + diag[i] + 1;
721           nz   = ai[i + 1] - diag[i] - 1;
722           s[0] = xb[i2];
723           s[1] = xb[i2 + 1];
724           s[2] = xb[i2 + 2];
725           s[3] = xb[i2 + 3];
726           s[4] = xb[i2 + 4];
727           s[5] = xb[i2 + 5];
728           while (nz--) {
729             idx   = 6 * (*vi++);
730             xw[0] = x[idx];
731             xw[1] = x[1 + idx];
732             xw[2] = x[2 + idx];
733             xw[3] = x[3 + idx];
734             xw[4] = x[4 + idx];
735             xw[5] = x[5 + idx];
736             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
737             v += 36;
738           }
739           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
740           x[i2]     = xw[0];
741           x[i2 + 1] = xw[1];
742           x[i2 + 2] = xw[2];
743           x[i2 + 3] = xw[3];
744           x[i2 + 4] = xw[4];
745           x[i2 + 5] = xw[5];
746           idiag -= 36;
747           i2 -= 6;
748         }
749         break;
750       case 7:
751         s[0] = xb[i2];
752         s[1] = xb[i2 + 1];
753         s[2] = xb[i2 + 2];
754         s[3] = xb[i2 + 3];
755         s[4] = xb[i2 + 4];
756         s[5] = xb[i2 + 5];
757         s[6] = xb[i2 + 6];
758         PetscKernel_v_gets_A_times_w_7(x, idiag, b);
759         x[i2]     = xw[0];
760         x[i2 + 1] = xw[1];
761         x[i2 + 2] = xw[2];
762         x[i2 + 3] = xw[3];
763         x[i2 + 4] = xw[4];
764         x[i2 + 5] = xw[5];
765         x[i2 + 6] = xw[6];
766         i2 -= 7;
767         idiag -= 49;
768         for (i = m - 2; i >= 0; i--) {
769           v    = aa + 49 * (diag[i] + 1);
770           vi   = aj + diag[i] + 1;
771           nz   = ai[i + 1] - diag[i] - 1;
772           s[0] = xb[i2];
773           s[1] = xb[i2 + 1];
774           s[2] = xb[i2 + 2];
775           s[3] = xb[i2 + 3];
776           s[4] = xb[i2 + 4];
777           s[5] = xb[i2 + 5];
778           s[6] = xb[i2 + 6];
779           while (nz--) {
780             idx   = 7 * (*vi++);
781             xw[0] = x[idx];
782             xw[1] = x[1 + idx];
783             xw[2] = x[2 + idx];
784             xw[3] = x[3 + idx];
785             xw[4] = x[4 + idx];
786             xw[5] = x[5 + idx];
787             xw[6] = x[6 + idx];
788             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
789             v += 49;
790           }
791           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
792           x[i2]     = xw[0];
793           x[i2 + 1] = xw[1];
794           x[i2 + 2] = xw[2];
795           x[i2 + 3] = xw[3];
796           x[i2 + 4] = xw[4];
797           x[i2 + 5] = xw[5];
798           x[i2 + 6] = xw[6];
799           idiag -= 49;
800           i2 -= 7;
801         }
802         break;
803       default:
804         PetscCall(PetscArraycpy(w, xb + i2, bs));
805         PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
806         i2 -= bs;
807         idiag -= bs2;
808         for (i = m - 2; i >= 0; i--) {
809           v  = aa + bs2 * (diag[i] + 1);
810           vi = aj + diag[i] + 1;
811           nz = ai[i + 1] - diag[i] - 1;
812 
813           PetscCall(PetscArraycpy(w, xb + i2, bs));
814           /* copy all rows of x that are needed into contiguous space */
815           workt = work;
816           for (j = 0; j < nz; j++) {
817             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
818             workt += bs;
819           }
820           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
821           PetscKernel_w_gets_Ar_times_v(bs, bs, w, idiag, x + i2);
822 
823           idiag -= bs2;
824           i2 -= bs;
825         }
826         break;
827       }
828       PetscCall(PetscLogFlops(1.0 * bs2 * (a->nz)));
829     }
830     its--;
831   }
832   while (its--) {
833     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
834       idiag = a->idiag;
835       i2    = 0;
836       switch (bs) {
837       case 1:
838         for (i = 0; i < m; i++) {
839           v    = aa + ai[i];
840           vi   = aj + ai[i];
841           nz   = ai[i + 1] - ai[i];
842           s[0] = b[i2];
843           for (j = 0; j < nz; j++) {
844             xw[0] = x[vi[j]];
845             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
846           }
847           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
848           x[i2] += xw[0];
849           idiag += 1;
850           i2 += 1;
851         }
852         break;
853       case 2:
854         for (i = 0; i < m; i++) {
855           v    = aa + 4 * ai[i];
856           vi   = aj + ai[i];
857           nz   = ai[i + 1] - ai[i];
858           s[0] = b[i2];
859           s[1] = b[i2 + 1];
860           for (j = 0; j < nz; j++) {
861             idx   = 2 * vi[j];
862             it    = 4 * j;
863             xw[0] = x[idx];
864             xw[1] = x[1 + idx];
865             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
866           }
867           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
868           x[i2] += xw[0];
869           x[i2 + 1] += xw[1];
870           idiag += 4;
871           i2 += 2;
872         }
873         break;
874       case 3:
875         for (i = 0; i < m; i++) {
876           v    = aa + 9 * ai[i];
877           vi   = aj + ai[i];
878           nz   = ai[i + 1] - ai[i];
879           s[0] = b[i2];
880           s[1] = b[i2 + 1];
881           s[2] = b[i2 + 2];
882           while (nz--) {
883             idx   = 3 * (*vi++);
884             xw[0] = x[idx];
885             xw[1] = x[1 + idx];
886             xw[2] = x[2 + idx];
887             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
888             v += 9;
889           }
890           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
891           x[i2] += xw[0];
892           x[i2 + 1] += xw[1];
893           x[i2 + 2] += xw[2];
894           idiag += 9;
895           i2 += 3;
896         }
897         break;
898       case 4:
899         for (i = 0; i < m; i++) {
900           v    = aa + 16 * ai[i];
901           vi   = aj + ai[i];
902           nz   = ai[i + 1] - ai[i];
903           s[0] = b[i2];
904           s[1] = b[i2 + 1];
905           s[2] = b[i2 + 2];
906           s[3] = b[i2 + 3];
907           while (nz--) {
908             idx   = 4 * (*vi++);
909             xw[0] = x[idx];
910             xw[1] = x[1 + idx];
911             xw[2] = x[2 + idx];
912             xw[3] = x[3 + idx];
913             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
914             v += 16;
915           }
916           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
917           x[i2] += xw[0];
918           x[i2 + 1] += xw[1];
919           x[i2 + 2] += xw[2];
920           x[i2 + 3] += xw[3];
921           idiag += 16;
922           i2 += 4;
923         }
924         break;
925       case 5:
926         for (i = 0; i < m; i++) {
927           v    = aa + 25 * ai[i];
928           vi   = aj + ai[i];
929           nz   = ai[i + 1] - ai[i];
930           s[0] = b[i2];
931           s[1] = b[i2 + 1];
932           s[2] = b[i2 + 2];
933           s[3] = b[i2 + 3];
934           s[4] = b[i2 + 4];
935           while (nz--) {
936             idx   = 5 * (*vi++);
937             xw[0] = x[idx];
938             xw[1] = x[1 + idx];
939             xw[2] = x[2 + idx];
940             xw[3] = x[3 + idx];
941             xw[4] = x[4 + idx];
942             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
943             v += 25;
944           }
945           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
946           x[i2] += xw[0];
947           x[i2 + 1] += xw[1];
948           x[i2 + 2] += xw[2];
949           x[i2 + 3] += xw[3];
950           x[i2 + 4] += xw[4];
951           idiag += 25;
952           i2 += 5;
953         }
954         break;
955       case 6:
956         for (i = 0; i < m; i++) {
957           v    = aa + 36 * ai[i];
958           vi   = aj + ai[i];
959           nz   = ai[i + 1] - ai[i];
960           s[0] = b[i2];
961           s[1] = b[i2 + 1];
962           s[2] = b[i2 + 2];
963           s[3] = b[i2 + 3];
964           s[4] = b[i2 + 4];
965           s[5] = b[i2 + 5];
966           while (nz--) {
967             idx   = 6 * (*vi++);
968             xw[0] = x[idx];
969             xw[1] = x[1 + idx];
970             xw[2] = x[2 + idx];
971             xw[3] = x[3 + idx];
972             xw[4] = x[4 + idx];
973             xw[5] = x[5 + idx];
974             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
975             v += 36;
976           }
977           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
978           x[i2] += xw[0];
979           x[i2 + 1] += xw[1];
980           x[i2 + 2] += xw[2];
981           x[i2 + 3] += xw[3];
982           x[i2 + 4] += xw[4];
983           x[i2 + 5] += xw[5];
984           idiag += 36;
985           i2 += 6;
986         }
987         break;
988       case 7:
989         for (i = 0; i < m; i++) {
990           v    = aa + 49 * ai[i];
991           vi   = aj + ai[i];
992           nz   = ai[i + 1] - ai[i];
993           s[0] = b[i2];
994           s[1] = b[i2 + 1];
995           s[2] = b[i2 + 2];
996           s[3] = b[i2 + 3];
997           s[4] = b[i2 + 4];
998           s[5] = b[i2 + 5];
999           s[6] = b[i2 + 6];
1000           while (nz--) {
1001             idx   = 7 * (*vi++);
1002             xw[0] = x[idx];
1003             xw[1] = x[1 + idx];
1004             xw[2] = x[2 + idx];
1005             xw[3] = x[3 + idx];
1006             xw[4] = x[4 + idx];
1007             xw[5] = x[5 + idx];
1008             xw[6] = x[6 + idx];
1009             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1010             v += 49;
1011           }
1012           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1013           x[i2] += xw[0];
1014           x[i2 + 1] += xw[1];
1015           x[i2 + 2] += xw[2];
1016           x[i2 + 3] += xw[3];
1017           x[i2 + 4] += xw[4];
1018           x[i2 + 5] += xw[5];
1019           x[i2 + 6] += xw[6];
1020           idiag += 49;
1021           i2 += 7;
1022         }
1023         break;
1024       default:
1025         for (i = 0; i < m; i++) {
1026           v  = aa + bs2 * ai[i];
1027           vi = aj + ai[i];
1028           nz = ai[i + 1] - ai[i];
1029 
1030           PetscCall(PetscArraycpy(w, b + i2, bs));
1031           /* copy all rows of x that are needed into contiguous space */
1032           workt = work;
1033           for (j = 0; j < nz; j++) {
1034             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
1035             workt += bs;
1036           }
1037           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1038           PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);
1039 
1040           idiag += bs2;
1041           i2 += bs;
1042         }
1043         break;
1044       }
1045       PetscCall(PetscLogFlops(2.0 * bs2 * a->nz));
1046     }
1047     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
1048       idiag = a->idiag + bs2 * (a->mbs - 1);
1049       i2    = bs * (m - 1);
1050       switch (bs) {
1051       case 1:
1052         for (i = m - 1; i >= 0; i--) {
1053           v    = aa + ai[i];
1054           vi   = aj + ai[i];
1055           nz   = ai[i + 1] - ai[i];
1056           s[0] = b[i2];
1057           for (j = 0; j < nz; j++) {
1058             xw[0] = x[vi[j]];
1059             PetscKernel_v_gets_v_minus_A_times_w_1(s, (v + j), xw);
1060           }
1061           PetscKernel_v_gets_A_times_w_1(xw, idiag, s);
1062           x[i2] += xw[0];
1063           idiag -= 1;
1064           i2 -= 1;
1065         }
1066         break;
1067       case 2:
1068         for (i = m - 1; i >= 0; i--) {
1069           v    = aa + 4 * ai[i];
1070           vi   = aj + ai[i];
1071           nz   = ai[i + 1] - ai[i];
1072           s[0] = b[i2];
1073           s[1] = b[i2 + 1];
1074           for (j = 0; j < nz; j++) {
1075             idx   = 2 * vi[j];
1076             it    = 4 * j;
1077             xw[0] = x[idx];
1078             xw[1] = x[1 + idx];
1079             PetscKernel_v_gets_v_minus_A_times_w_2(s, (v + it), xw);
1080           }
1081           PetscKernel_v_gets_A_times_w_2(xw, idiag, s);
1082           x[i2] += xw[0];
1083           x[i2 + 1] += xw[1];
1084           idiag -= 4;
1085           i2 -= 2;
1086         }
1087         break;
1088       case 3:
1089         for (i = m - 1; i >= 0; i--) {
1090           v    = aa + 9 * ai[i];
1091           vi   = aj + ai[i];
1092           nz   = ai[i + 1] - ai[i];
1093           s[0] = b[i2];
1094           s[1] = b[i2 + 1];
1095           s[2] = b[i2 + 2];
1096           while (nz--) {
1097             idx   = 3 * (*vi++);
1098             xw[0] = x[idx];
1099             xw[1] = x[1 + idx];
1100             xw[2] = x[2 + idx];
1101             PetscKernel_v_gets_v_minus_A_times_w_3(s, v, xw);
1102             v += 9;
1103           }
1104           PetscKernel_v_gets_A_times_w_3(xw, idiag, s);
1105           x[i2] += xw[0];
1106           x[i2 + 1] += xw[1];
1107           x[i2 + 2] += xw[2];
1108           idiag -= 9;
1109           i2 -= 3;
1110         }
1111         break;
1112       case 4:
1113         for (i = m - 1; i >= 0; i--) {
1114           v    = aa + 16 * ai[i];
1115           vi   = aj + ai[i];
1116           nz   = ai[i + 1] - ai[i];
1117           s[0] = b[i2];
1118           s[1] = b[i2 + 1];
1119           s[2] = b[i2 + 2];
1120           s[3] = b[i2 + 3];
1121           while (nz--) {
1122             idx   = 4 * (*vi++);
1123             xw[0] = x[idx];
1124             xw[1] = x[1 + idx];
1125             xw[2] = x[2 + idx];
1126             xw[3] = x[3 + idx];
1127             PetscKernel_v_gets_v_minus_A_times_w_4(s, v, xw);
1128             v += 16;
1129           }
1130           PetscKernel_v_gets_A_times_w_4(xw, idiag, s);
1131           x[i2] += xw[0];
1132           x[i2 + 1] += xw[1];
1133           x[i2 + 2] += xw[2];
1134           x[i2 + 3] += xw[3];
1135           idiag -= 16;
1136           i2 -= 4;
1137         }
1138         break;
1139       case 5:
1140         for (i = m - 1; i >= 0; i--) {
1141           v    = aa + 25 * ai[i];
1142           vi   = aj + ai[i];
1143           nz   = ai[i + 1] - ai[i];
1144           s[0] = b[i2];
1145           s[1] = b[i2 + 1];
1146           s[2] = b[i2 + 2];
1147           s[3] = b[i2 + 3];
1148           s[4] = b[i2 + 4];
1149           while (nz--) {
1150             idx   = 5 * (*vi++);
1151             xw[0] = x[idx];
1152             xw[1] = x[1 + idx];
1153             xw[2] = x[2 + idx];
1154             xw[3] = x[3 + idx];
1155             xw[4] = x[4 + idx];
1156             PetscKernel_v_gets_v_minus_A_times_w_5(s, v, xw);
1157             v += 25;
1158           }
1159           PetscKernel_v_gets_A_times_w_5(xw, idiag, s);
1160           x[i2] += xw[0];
1161           x[i2 + 1] += xw[1];
1162           x[i2 + 2] += xw[2];
1163           x[i2 + 3] += xw[3];
1164           x[i2 + 4] += xw[4];
1165           idiag -= 25;
1166           i2 -= 5;
1167         }
1168         break;
1169       case 6:
1170         for (i = m - 1; i >= 0; i--) {
1171           v    = aa + 36 * ai[i];
1172           vi   = aj + ai[i];
1173           nz   = ai[i + 1] - ai[i];
1174           s[0] = b[i2];
1175           s[1] = b[i2 + 1];
1176           s[2] = b[i2 + 2];
1177           s[3] = b[i2 + 3];
1178           s[4] = b[i2 + 4];
1179           s[5] = b[i2 + 5];
1180           while (nz--) {
1181             idx   = 6 * (*vi++);
1182             xw[0] = x[idx];
1183             xw[1] = x[1 + idx];
1184             xw[2] = x[2 + idx];
1185             xw[3] = x[3 + idx];
1186             xw[4] = x[4 + idx];
1187             xw[5] = x[5 + idx];
1188             PetscKernel_v_gets_v_minus_A_times_w_6(s, v, xw);
1189             v += 36;
1190           }
1191           PetscKernel_v_gets_A_times_w_6(xw, idiag, s);
1192           x[i2] += xw[0];
1193           x[i2 + 1] += xw[1];
1194           x[i2 + 2] += xw[2];
1195           x[i2 + 3] += xw[3];
1196           x[i2 + 4] += xw[4];
1197           x[i2 + 5] += xw[5];
1198           idiag -= 36;
1199           i2 -= 6;
1200         }
1201         break;
1202       case 7:
1203         for (i = m - 1; i >= 0; i--) {
1204           v    = aa + 49 * ai[i];
1205           vi   = aj + ai[i];
1206           nz   = ai[i + 1] - ai[i];
1207           s[0] = b[i2];
1208           s[1] = b[i2 + 1];
1209           s[2] = b[i2 + 2];
1210           s[3] = b[i2 + 3];
1211           s[4] = b[i2 + 4];
1212           s[5] = b[i2 + 5];
1213           s[6] = b[i2 + 6];
1214           while (nz--) {
1215             idx   = 7 * (*vi++);
1216             xw[0] = x[idx];
1217             xw[1] = x[1 + idx];
1218             xw[2] = x[2 + idx];
1219             xw[3] = x[3 + idx];
1220             xw[4] = x[4 + idx];
1221             xw[5] = x[5 + idx];
1222             xw[6] = x[6 + idx];
1223             PetscKernel_v_gets_v_minus_A_times_w_7(s, v, xw);
1224             v += 49;
1225           }
1226           PetscKernel_v_gets_A_times_w_7(xw, idiag, s);
1227           x[i2] += xw[0];
1228           x[i2 + 1] += xw[1];
1229           x[i2 + 2] += xw[2];
1230           x[i2 + 3] += xw[3];
1231           x[i2 + 4] += xw[4];
1232           x[i2 + 5] += xw[5];
1233           x[i2 + 6] += xw[6];
1234           idiag -= 49;
1235           i2 -= 7;
1236         }
1237         break;
1238       default:
1239         for (i = m - 1; i >= 0; i--) {
1240           v  = aa + bs2 * ai[i];
1241           vi = aj + ai[i];
1242           nz = ai[i + 1] - ai[i];
1243 
1244           PetscCall(PetscArraycpy(w, b + i2, bs));
1245           /* copy all rows of x that are needed into contiguous space */
1246           workt = work;
1247           for (j = 0; j < nz; j++) {
1248             PetscCall(PetscArraycpy(workt, x + bs * (*vi++), bs));
1249             workt += bs;
1250           }
1251           PetscKernel_w_gets_w_minus_Ar_times_v(bs, bs * nz, w, v, work);
1252           PetscKernel_w_gets_w_plus_Ar_times_v(bs, bs, w, idiag, x + i2);
1253 
1254           idiag -= bs2;
1255           i2 -= bs;
1256         }
1257         break;
1258       }
1259       PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz)));
1260     }
1261   }
1262   PetscCall(VecRestoreArray(xx, &x));
1263   PetscCall(VecRestoreArrayRead(bb, &b));
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 /*
1268     Special version for direct calls from Fortran (Used in PETSc-fun3d)
1269 */
1270 #if defined(PETSC_HAVE_FORTRAN_CAPS)
1271   #define matsetvaluesblocked4_ MATSETVALUESBLOCKED4
1272 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1273   #define matsetvaluesblocked4_ matsetvaluesblocked4
1274 #endif
1275 
1276 PETSC_EXTERN void matsetvaluesblocked4_(Mat *AA, PetscInt *mm, const PetscInt im[], PetscInt *nn, const PetscInt in[], const PetscScalar v[])
1277 {
1278   Mat                A = *AA;
1279   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1280   PetscInt          *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, N, m = *mm, n = *nn;
1281   PetscInt          *ai = a->i, *ailen = a->ilen;
1282   PetscInt          *aj = a->j, stepval, lastcol = -1;
1283   const PetscScalar *value = v;
1284   MatScalar         *ap, *aa = a->a, *bap;
1285 
1286   PetscFunctionBegin;
1287   if (A->rmap->bs != 4) SETERRABORT(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Can only be called with a block size of 4");
1288   stepval = (n - 1) * 4;
1289   for (k = 0; k < m; k++) { /* loop over added rows */
1290     row  = im[k];
1291     rp   = aj + ai[row];
1292     ap   = aa + 16 * ai[row];
1293     nrow = ailen[row];
1294     low  = 0;
1295     high = nrow;
1296     for (l = 0; l < n; l++) { /* loop over added columns */
1297       col = in[l];
1298       if (col <= lastcol) low = 0;
1299       else high = nrow;
1300       lastcol = col;
1301       value   = v + k * (stepval + 4 + l) * 4;
1302       while (high - low > 7) {
1303         t = (low + high) / 2;
1304         if (rp[t] > col) high = t;
1305         else low = t;
1306       }
1307       for (i = low; i < high; i++) {
1308         if (rp[i] > col) break;
1309         if (rp[i] == col) {
1310           bap = ap + 16 * i;
1311           for (ii = 0; ii < 4; ii++, value += stepval) {
1312             for (jj = ii; jj < 16; jj += 4) bap[jj] += *value++;
1313           }
1314           goto noinsert2;
1315         }
1316       }
1317       N = nrow++ - 1;
1318       high++; /* added new column index thus must search to one higher than before */
1319       /* shift up all the later entries in this row */
1320       for (ii = N; ii >= i; ii--) {
1321         rp[ii + 1] = rp[ii];
1322         PetscCallVoid(PetscArraycpy(ap + 16 * (ii + 1), ap + 16 * (ii), 16));
1323       }
1324       if (N >= i) PetscCallVoid(PetscArrayzero(ap + 16 * i, 16));
1325       rp[i] = col;
1326       bap   = ap + 16 * i;
1327       for (ii = 0; ii < 4; ii++, value += stepval) {
1328         for (jj = ii; jj < 16; jj += 4) bap[jj] = *value++;
1329       }
1330     noinsert2:;
1331       low = i;
1332     }
1333     ailen[row] = nrow;
1334   }
1335   PetscFunctionReturnVoid();
1336 }
1337 
1338 #if defined(PETSC_HAVE_FORTRAN_CAPS)
1339   #define matsetvalues4_ MATSETVALUES4
1340 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
1341   #define matsetvalues4_ matsetvalues4
1342 #endif
1343 
1344 PETSC_EXTERN void matsetvalues4_(Mat *AA, PetscInt *mm, PetscInt *im, PetscInt *nn, PetscInt *in, PetscScalar *v)
1345 {
1346   Mat          A = *AA;
1347   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1348   PetscInt    *rp, k, low, high, t, row, nrow, i, col, l, N, n = *nn, m = *mm;
1349   PetscInt    *ai = a->i, *ailen = a->ilen;
1350   PetscInt    *aj = a->j, brow, bcol;
1351   PetscInt     ridx, cidx, lastcol = -1;
1352   MatScalar   *ap, value, *aa      = a->a, *bap;
1353 
1354   PetscFunctionBegin;
1355   for (k = 0; k < m; k++) { /* loop over added rows */
1356     row  = im[k];
1357     brow = row / 4;
1358     rp   = aj + ai[brow];
1359     ap   = aa + 16 * ai[brow];
1360     nrow = ailen[brow];
1361     low  = 0;
1362     high = nrow;
1363     for (l = 0; l < n; l++) { /* loop over added columns */
1364       col   = in[l];
1365       bcol  = col / 4;
1366       ridx  = row % 4;
1367       cidx  = col % 4;
1368       value = v[l + k * n];
1369       if (col <= lastcol) low = 0;
1370       else high = nrow;
1371       lastcol = col;
1372       while (high - low > 7) {
1373         t = (low + high) / 2;
1374         if (rp[t] > bcol) high = t;
1375         else low = t;
1376       }
1377       for (i = low; i < high; i++) {
1378         if (rp[i] > bcol) break;
1379         if (rp[i] == bcol) {
1380           bap = ap + 16 * i + 4 * cidx + ridx;
1381           *bap += value;
1382           goto noinsert1;
1383         }
1384       }
1385       N = nrow++ - 1;
1386       high++; /* added new column thus must search to one higher than before */
1387       /* shift up all the later entries in this row */
1388       PetscCallVoid(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
1389       PetscCallVoid(PetscArraymove(ap + 16 * i + 16, ap + 16 * i, 16 * (N - i + 1)));
1390       PetscCallVoid(PetscArrayzero(ap + 16 * i, 16));
1391       rp[i]                        = bcol;
1392       ap[16 * i + 4 * cidx + ridx] = value;
1393     noinsert1:;
1394       low = i;
1395     }
1396     ailen[brow] = nrow;
1397   }
1398   PetscFunctionReturnVoid();
1399 }
1400 
1401 /*
1402      Checks for missing diagonals
1403 */
1404 PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat A, PetscBool *missing, PetscInt *d)
1405 {
1406   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1407   PetscInt    *diag, *ii = a->i, i;
1408 
1409   PetscFunctionBegin;
1410   PetscCall(MatMarkDiagonal_SeqBAIJ(A));
1411   *missing = PETSC_FALSE;
1412   if (A->rmap->n > 0 && !ii) {
1413     *missing = PETSC_TRUE;
1414     if (d) *d = 0;
1415     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
1416   } else {
1417     PetscInt n;
1418     n    = PetscMin(a->mbs, a->nbs);
1419     diag = a->diag;
1420     for (i = 0; i < n; i++) {
1421       if (diag[i] >= ii[i + 1]) {
1422         *missing = PETSC_TRUE;
1423         if (d) *d = i;
1424         PetscCall(PetscInfo(A, "Matrix is missing block diagonal number %" PetscInt_FMT "\n", i));
1425         break;
1426       }
1427     }
1428   }
1429   PetscFunctionReturn(0);
1430 }
1431 
1432 PetscErrorCode MatMarkDiagonal_SeqBAIJ(Mat A)
1433 {
1434   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1435   PetscInt     i, j, m = a->mbs;
1436 
1437   PetscFunctionBegin;
1438   if (!a->diag) {
1439     PetscCall(PetscMalloc1(m, &a->diag));
1440     a->free_diag = PETSC_TRUE;
1441   }
1442   for (i = 0; i < m; i++) {
1443     a->diag[i] = a->i[i + 1];
1444     for (j = a->i[i]; j < a->i[i + 1]; j++) {
1445       if (a->j[j] == i) {
1446         a->diag[i] = j;
1447         break;
1448       }
1449     }
1450   }
1451   PetscFunctionReturn(0);
1452 }
1453 
1454 static PetscErrorCode MatGetRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *inia[], const PetscInt *inja[], PetscBool *done)
1455 {
1456   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1457   PetscInt     i, j, n = a->mbs, nz = a->i[n], *tia, *tja, bs = A->rmap->bs, k, l, cnt;
1458   PetscInt   **ia = (PetscInt **)inia, **ja = (PetscInt **)inja;
1459 
1460   PetscFunctionBegin;
1461   *nn = n;
1462   if (!ia) PetscFunctionReturn(0);
1463   if (symmetric) {
1464     PetscCall(MatToSymmetricIJ_SeqAIJ(n, a->i, a->j, PETSC_TRUE, 0, 0, &tia, &tja));
1465     nz = tia[n];
1466   } else {
1467     tia = a->i;
1468     tja = a->j;
1469   }
1470 
1471   if (!blockcompressed && bs > 1) {
1472     (*nn) *= bs;
1473     /* malloc & create the natural set of indices */
1474     PetscCall(PetscMalloc1((n + 1) * bs, ia));
1475     if (n) {
1476       (*ia)[0] = oshift;
1477       for (j = 1; j < bs; j++) (*ia)[j] = (tia[1] - tia[0]) * bs + (*ia)[j - 1];
1478     }
1479 
1480     for (i = 1; i < n; i++) {
1481       (*ia)[i * bs] = (tia[i] - tia[i - 1]) * bs + (*ia)[i * bs - 1];
1482       for (j = 1; j < bs; j++) (*ia)[i * bs + j] = (tia[i + 1] - tia[i]) * bs + (*ia)[i * bs + j - 1];
1483     }
1484     if (n) (*ia)[n * bs] = (tia[n] - tia[n - 1]) * bs + (*ia)[n * bs - 1];
1485 
1486     if (inja) {
1487       PetscCall(PetscMalloc1(nz * bs * bs, ja));
1488       cnt = 0;
1489       for (i = 0; i < n; i++) {
1490         for (j = 0; j < bs; j++) {
1491           for (k = tia[i]; k < tia[i + 1]; k++) {
1492             for (l = 0; l < bs; l++) (*ja)[cnt++] = bs * tja[k] + l;
1493           }
1494         }
1495       }
1496     }
1497 
1498     if (symmetric) { /* deallocate memory allocated in MatToSymmetricIJ_SeqAIJ() */
1499       PetscCall(PetscFree(tia));
1500       PetscCall(PetscFree(tja));
1501     }
1502   } else if (oshift == 1) {
1503     if (symmetric) {
1504       nz = tia[A->rmap->n / bs];
1505       /*  add 1 to i and j indices */
1506       for (i = 0; i < A->rmap->n / bs + 1; i++) tia[i] = tia[i] + 1;
1507       *ia = tia;
1508       if (ja) {
1509         for (i = 0; i < nz; i++) tja[i] = tja[i] + 1;
1510         *ja = tja;
1511       }
1512     } else {
1513       nz = a->i[A->rmap->n / bs];
1514       /* malloc space and  add 1 to i and j indices */
1515       PetscCall(PetscMalloc1(A->rmap->n / bs + 1, ia));
1516       for (i = 0; i < A->rmap->n / bs + 1; i++) (*ia)[i] = a->i[i] + 1;
1517       if (ja) {
1518         PetscCall(PetscMalloc1(nz, ja));
1519         for (i = 0; i < nz; i++) (*ja)[i] = a->j[i] + 1;
1520       }
1521     }
1522   } else {
1523     *ia = tia;
1524     if (ja) *ja = tja;
1525   }
1526   PetscFunctionReturn(0);
1527 }
1528 
1529 static PetscErrorCode MatRestoreRowIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
1530 {
1531   PetscFunctionBegin;
1532   if (!ia) PetscFunctionReturn(0);
1533   if ((!blockcompressed && A->rmap->bs > 1) || (symmetric || oshift == 1)) {
1534     PetscCall(PetscFree(*ia));
1535     if (ja) PetscCall(PetscFree(*ja));
1536   }
1537   PetscFunctionReturn(0);
1538 }
1539 
1540 PetscErrorCode MatDestroy_SeqBAIJ(Mat A)
1541 {
1542   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1543 
1544   PetscFunctionBegin;
1545 #if defined(PETSC_USE_LOG)
1546   PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->N, A->cmap->n, a->nz);
1547 #endif
1548   PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
1549   PetscCall(ISDestroy(&a->row));
1550   PetscCall(ISDestroy(&a->col));
1551   if (a->free_diag) PetscCall(PetscFree(a->diag));
1552   PetscCall(PetscFree(a->idiag));
1553   if (a->free_imax_ilen) PetscCall(PetscFree2(a->imax, a->ilen));
1554   PetscCall(PetscFree(a->solve_work));
1555   PetscCall(PetscFree(a->mult_work));
1556   PetscCall(PetscFree(a->sor_workt));
1557   PetscCall(PetscFree(a->sor_work));
1558   PetscCall(ISDestroy(&a->icol));
1559   PetscCall(PetscFree(a->saved_values));
1560   PetscCall(PetscFree2(a->compressedrow.i, a->compressedrow.rindex));
1561 
1562   PetscCall(MatDestroy(&a->sbaijMat));
1563   PetscCall(MatDestroy(&a->parent));
1564   PetscCall(PetscFree(A->data));
1565 
1566   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
1567   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJGetArray_C", NULL));
1568   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJRestoreArray_C", NULL));
1569   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
1570   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
1571   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetColumnIndices_C", NULL));
1572   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqaij_C", NULL));
1573   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqsbaij_C", NULL));
1574   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocation_C", NULL));
1575   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqBAIJSetPreallocationCSR_C", NULL));
1576   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_seqbstrm_C", NULL));
1577   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatIsTranspose_C", NULL));
1578 #if defined(PETSC_HAVE_HYPRE)
1579   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_hypre_C", NULL));
1580 #endif
1581   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqbaij_is_C", NULL));
1582   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
1583   PetscFunctionReturn(0);
1584 }
1585 
1586 PetscErrorCode MatSetOption_SeqBAIJ(Mat A, MatOption op, PetscBool flg)
1587 {
1588   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1589 
1590   PetscFunctionBegin;
1591   switch (op) {
1592   case MAT_ROW_ORIENTED:
1593     a->roworiented = flg;
1594     break;
1595   case MAT_KEEP_NONZERO_PATTERN:
1596     a->keepnonzeropattern = flg;
1597     break;
1598   case MAT_NEW_NONZERO_LOCATIONS:
1599     a->nonew = (flg ? 0 : 1);
1600     break;
1601   case MAT_NEW_NONZERO_LOCATION_ERR:
1602     a->nonew = (flg ? -1 : 0);
1603     break;
1604   case MAT_NEW_NONZERO_ALLOCATION_ERR:
1605     a->nonew = (flg ? -2 : 0);
1606     break;
1607   case MAT_UNUSED_NONZERO_LOCATION_ERR:
1608     a->nounused = (flg ? -1 : 0);
1609     break;
1610   case MAT_FORCE_DIAGONAL_ENTRIES:
1611   case MAT_IGNORE_OFF_PROC_ENTRIES:
1612   case MAT_USE_HASH_TABLE:
1613   case MAT_SORTED_FULL:
1614     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
1615     break;
1616   case MAT_SPD:
1617   case MAT_SYMMETRIC:
1618   case MAT_STRUCTURALLY_SYMMETRIC:
1619   case MAT_HERMITIAN:
1620   case MAT_SYMMETRY_ETERNAL:
1621   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
1622   case MAT_SUBMAT_SINGLEIS:
1623   case MAT_STRUCTURE_ONLY:
1624   case MAT_SPD_ETERNAL:
1625     /* if the diagonal matrix is square it inherits some of the properties above */
1626     break;
1627   default:
1628     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
1629   }
1630   PetscFunctionReturn(0);
1631 }
1632 
1633 /* used for both SeqBAIJ and SeqSBAIJ matrices */
1634 PetscErrorCode MatGetRow_SeqBAIJ_private(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v, PetscInt *ai, PetscInt *aj, PetscScalar *aa)
1635 {
1636   PetscInt     itmp, i, j, k, M, bn, bp, *idx_i, bs, bs2;
1637   MatScalar   *aa_i;
1638   PetscScalar *v_i;
1639 
1640   PetscFunctionBegin;
1641   bs  = A->rmap->bs;
1642   bs2 = bs * bs;
1643   PetscCheck(row >= 0 && row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
1644 
1645   bn  = row / bs; /* Block number */
1646   bp  = row % bs; /* Block Position */
1647   M   = ai[bn + 1] - ai[bn];
1648   *nz = bs * M;
1649 
1650   if (v) {
1651     *v = NULL;
1652     if (*nz) {
1653       PetscCall(PetscMalloc1(*nz, v));
1654       for (i = 0; i < M; i++) { /* for each block in the block row */
1655         v_i  = *v + i * bs;
1656         aa_i = aa + bs2 * (ai[bn] + i);
1657         for (j = bp, k = 0; j < bs2; j += bs, k++) v_i[k] = aa_i[j];
1658       }
1659     }
1660   }
1661 
1662   if (idx) {
1663     *idx = NULL;
1664     if (*nz) {
1665       PetscCall(PetscMalloc1(*nz, idx));
1666       for (i = 0; i < M; i++) { /* for each block in the block row */
1667         idx_i = *idx + i * bs;
1668         itmp  = bs * aj[ai[bn] + i];
1669         for (j = 0; j < bs; j++) idx_i[j] = itmp++;
1670       }
1671     }
1672   }
1673   PetscFunctionReturn(0);
1674 }
1675 
1676 PetscErrorCode MatGetRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1677 {
1678   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1679 
1680   PetscFunctionBegin;
1681   PetscCall(MatGetRow_SeqBAIJ_private(A, row, nz, idx, v, a->i, a->j, a->a));
1682   PetscFunctionReturn(0);
1683 }
1684 
1685 PetscErrorCode MatRestoreRow_SeqBAIJ(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
1686 {
1687   PetscFunctionBegin;
1688   if (nz) *nz = 0;
1689   if (idx) PetscCall(PetscFree(*idx));
1690   if (v) PetscCall(PetscFree(*v));
1691   PetscFunctionReturn(0);
1692 }
1693 
1694 PetscErrorCode MatTranspose_SeqBAIJ(Mat A, MatReuse reuse, Mat *B)
1695 {
1696   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *at;
1697   Mat          C;
1698   PetscInt     i, j, k, *aj = a->j, *ai = a->i, bs = A->rmap->bs, mbs = a->mbs, nbs = a->nbs, *atfill;
1699   PetscInt     bs2 = a->bs2, *ati, *atj, anzj, kr;
1700   MatScalar   *ata, *aa = a->a;
1701 
1702   PetscFunctionBegin;
1703   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
1704   PetscCall(PetscCalloc1(1 + nbs, &atfill));
1705   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
1706     for (i = 0; i < ai[mbs]; i++) atfill[aj[i]] += 1; /* count num of non-zeros in row aj[i] */
1707 
1708     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &C));
1709     PetscCall(MatSetSizes(C, A->cmap->n, A->rmap->N, A->cmap->n, A->rmap->N));
1710     PetscCall(MatSetType(C, ((PetscObject)A)->type_name));
1711     PetscCall(MatSeqBAIJSetPreallocation(C, bs, 0, atfill));
1712 
1713     at  = (Mat_SeqBAIJ *)C->data;
1714     ati = at->i;
1715     for (i = 0; i < nbs; i++) at->ilen[i] = at->imax[i] = ati[i + 1] - ati[i];
1716   } else {
1717     C   = *B;
1718     at  = (Mat_SeqBAIJ *)C->data;
1719     ati = at->i;
1720   }
1721 
1722   atj = at->j;
1723   ata = at->a;
1724 
1725   /* Copy ati into atfill so we have locations of the next free space in atj */
1726   PetscCall(PetscArraycpy(atfill, ati, nbs));
1727 
1728   /* Walk through A row-wise and mark nonzero entries of A^T. */
1729   for (i = 0; i < mbs; i++) {
1730     anzj = ai[i + 1] - ai[i];
1731     for (j = 0; j < anzj; j++) {
1732       atj[atfill[*aj]] = i;
1733       for (kr = 0; kr < bs; kr++) {
1734         for (k = 0; k < bs; k++) ata[bs2 * atfill[*aj] + k * bs + kr] = *aa++;
1735       }
1736       atfill[*aj++] += 1;
1737     }
1738   }
1739   PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
1740   PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
1741 
1742   /* Clean up temporary space and complete requests. */
1743   PetscCall(PetscFree(atfill));
1744 
1745   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_REUSE_MATRIX) {
1746     PetscCall(MatSetBlockSizes(C, PetscAbs(A->cmap->bs), PetscAbs(A->rmap->bs)));
1747     *B = C;
1748   } else {
1749     PetscCall(MatHeaderMerge(A, &C));
1750   }
1751   PetscFunctionReturn(0);
1752 }
1753 
1754 PetscErrorCode MatIsTranspose_SeqBAIJ(Mat A, Mat B, PetscReal tol, PetscBool *f)
1755 {
1756   Mat Btrans;
1757 
1758   PetscFunctionBegin;
1759   *f = PETSC_FALSE;
1760   PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &Btrans));
1761   PetscCall(MatEqual_SeqBAIJ(B, Btrans, f));
1762   PetscCall(MatDestroy(&Btrans));
1763   PetscFunctionReturn(0);
1764 }
1765 
1766 /* Used for both SeqBAIJ and SeqSBAIJ matrices */
1767 PetscErrorCode MatView_SeqBAIJ_Binary(Mat mat, PetscViewer viewer)
1768 {
1769   Mat_SeqBAIJ *A = (Mat_SeqBAIJ *)mat->data;
1770   PetscInt     header[4], M, N, m, bs, nz, cnt, i, j, k, l;
1771   PetscInt    *rowlens, *colidxs;
1772   PetscScalar *matvals;
1773 
1774   PetscFunctionBegin;
1775   PetscCall(PetscViewerSetUp(viewer));
1776 
1777   M  = mat->rmap->N;
1778   N  = mat->cmap->N;
1779   m  = mat->rmap->n;
1780   bs = mat->rmap->bs;
1781   nz = bs * bs * A->nz;
1782 
1783   /* write matrix header */
1784   header[0] = MAT_FILE_CLASSID;
1785   header[1] = M;
1786   header[2] = N;
1787   header[3] = nz;
1788   PetscCall(PetscViewerBinaryWrite(viewer, header, 4, PETSC_INT));
1789 
1790   /* store row lengths */
1791   PetscCall(PetscMalloc1(m, &rowlens));
1792   for (cnt = 0, i = 0; i < A->mbs; i++)
1793     for (j = 0; j < bs; j++) rowlens[cnt++] = bs * (A->i[i + 1] - A->i[i]);
1794   PetscCall(PetscViewerBinaryWrite(viewer, rowlens, m, PETSC_INT));
1795   PetscCall(PetscFree(rowlens));
1796 
1797   /* store column indices  */
1798   PetscCall(PetscMalloc1(nz, &colidxs));
1799   for (cnt = 0, i = 0; i < A->mbs; i++)
1800     for (k = 0; k < bs; k++)
1801       for (j = A->i[i]; j < A->i[i + 1]; j++)
1802         for (l = 0; l < bs; l++) colidxs[cnt++] = bs * A->j[j] + l;
1803   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt_FMT, cnt, nz);
1804   PetscCall(PetscViewerBinaryWrite(viewer, colidxs, nz, PETSC_INT));
1805   PetscCall(PetscFree(colidxs));
1806 
1807   /* store nonzero values */
1808   PetscCall(PetscMalloc1(nz, &matvals));
1809   for (cnt = 0, i = 0; i < A->mbs; i++)
1810     for (k = 0; k < bs; k++)
1811       for (j = A->i[i]; j < A->i[i + 1]; j++)
1812         for (l = 0; l < bs; l++) matvals[cnt++] = A->a[bs * (bs * j + l) + k];
1813   PetscCheck(cnt == nz, PETSC_COMM_SELF, PETSC_ERR_LIB, "Internal PETSc error: cnt = %" PetscInt_FMT " nz = %" PetscInt_FMT, cnt, nz);
1814   PetscCall(PetscViewerBinaryWrite(viewer, matvals, nz, PETSC_SCALAR));
1815   PetscCall(PetscFree(matvals));
1816 
1817   /* write block size option to the viewer's .info file */
1818   PetscCall(MatView_Binary_BlockSizes(mat, viewer));
1819   PetscFunctionReturn(0);
1820 }
1821 
1822 static PetscErrorCode MatView_SeqBAIJ_ASCII_structonly(Mat A, PetscViewer viewer)
1823 {
1824   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
1825   PetscInt     i, bs = A->rmap->bs, k;
1826 
1827   PetscFunctionBegin;
1828   PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1829   for (i = 0; i < a->mbs; i++) {
1830     PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT "-%" PetscInt_FMT ":", i * bs, i * bs + bs - 1));
1831     for (k = a->i[i]; k < a->i[i + 1]; k++) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT "-%" PetscInt_FMT ") ", bs * a->j[k], bs * a->j[k] + bs - 1));
1832     PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1833   }
1834   PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1835   PetscFunctionReturn(0);
1836 }
1837 
1838 static PetscErrorCode MatView_SeqBAIJ_ASCII(Mat A, PetscViewer viewer)
1839 {
1840   Mat_SeqBAIJ      *a = (Mat_SeqBAIJ *)A->data;
1841   PetscInt          i, j, bs = A->rmap->bs, k, l, bs2 = a->bs2;
1842   PetscViewerFormat format;
1843 
1844   PetscFunctionBegin;
1845   if (A->structure_only) {
1846     PetscCall(MatView_SeqBAIJ_ASCII_structonly(A, viewer));
1847     PetscFunctionReturn(0);
1848   }
1849 
1850   PetscCall(PetscViewerGetFormat(viewer, &format));
1851   if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1852     PetscCall(PetscViewerASCIIPrintf(viewer, "  block size is %" PetscInt_FMT "\n", bs));
1853   } else if (format == PETSC_VIEWER_ASCII_MATLAB) {
1854     const char *matname;
1855     Mat         aij;
1856     PetscCall(MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &aij));
1857     PetscCall(PetscObjectGetName((PetscObject)A, &matname));
1858     PetscCall(PetscObjectSetName((PetscObject)aij, matname));
1859     PetscCall(MatView(aij, viewer));
1860     PetscCall(MatDestroy(&aij));
1861   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO) {
1862     PetscFunctionReturn(0);
1863   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1864     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1865     for (i = 0; i < a->mbs; i++) {
1866       for (j = 0; j < bs; j++) {
1867         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j));
1868         for (k = a->i[i]; k < a->i[i + 1]; k++) {
1869           for (l = 0; l < bs; l++) {
1870 #if defined(PETSC_USE_COMPLEX)
1871             if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1872               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1873             } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0 && PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1874               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %gi) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1875             } else if (PetscRealPart(a->a[bs2 * k + l * bs + j]) != 0.0) {
1876               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j])));
1877             }
1878 #else
1879             if (a->a[bs2 * k + l * bs + j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]));
1880 #endif
1881           }
1882         }
1883         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1884       }
1885     }
1886     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1887   } else {
1888     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1889     for (i = 0; i < a->mbs; i++) {
1890       for (j = 0; j < bs; j++) {
1891         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i * bs + j));
1892         for (k = a->i[i]; k < a->i[i + 1]; k++) {
1893           for (l = 0; l < bs; l++) {
1894 #if defined(PETSC_USE_COMPLEX)
1895             if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) > 0.0) {
1896               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), (double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1897             } else if (PetscImaginaryPart(a->a[bs2 * k + l * bs + j]) < 0.0) {
1898               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j]), -(double)PetscImaginaryPart(a->a[bs2 * k + l * bs + j])));
1899             } else {
1900               PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)PetscRealPart(a->a[bs2 * k + l * bs + j])));
1901             }
1902 #else
1903             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", bs * a->j[k] + l, (double)a->a[bs2 * k + l * bs + j]));
1904 #endif
1905           }
1906         }
1907         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1908       }
1909     }
1910     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1911   }
1912   PetscCall(PetscViewerFlush(viewer));
1913   PetscFunctionReturn(0);
1914 }
1915 
1916 #include <petscdraw.h>
1917 static PetscErrorCode MatView_SeqBAIJ_Draw_Zoom(PetscDraw draw, void *Aa)
1918 {
1919   Mat               A = (Mat)Aa;
1920   Mat_SeqBAIJ      *a = (Mat_SeqBAIJ *)A->data;
1921   PetscInt          row, i, j, k, l, mbs = a->mbs, color, bs = A->rmap->bs, bs2 = a->bs2;
1922   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1923   MatScalar        *aa;
1924   PetscViewer       viewer;
1925   PetscViewerFormat format;
1926 
1927   PetscFunctionBegin;
1928   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1929   PetscCall(PetscViewerGetFormat(viewer, &format));
1930   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1931 
1932   /* loop over matrix elements drawing boxes */
1933 
1934   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1935     PetscDrawCollectiveBegin(draw);
1936     /* Blue for negative, Cyan for zero and  Red for positive */
1937     color = PETSC_DRAW_BLUE;
1938     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1939       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1940         y_l = A->rmap->N - row - 1.0;
1941         y_r = y_l + 1.0;
1942         x_l = a->j[j] * bs;
1943         x_r = x_l + 1.0;
1944         aa  = a->a + j * bs2;
1945         for (k = 0; k < bs; k++) {
1946           for (l = 0; l < bs; l++) {
1947             if (PetscRealPart(*aa++) >= 0.) continue;
1948             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1949           }
1950         }
1951       }
1952     }
1953     color = PETSC_DRAW_CYAN;
1954     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1955       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1956         y_l = A->rmap->N - row - 1.0;
1957         y_r = y_l + 1.0;
1958         x_l = a->j[j] * bs;
1959         x_r = x_l + 1.0;
1960         aa  = a->a + j * bs2;
1961         for (k = 0; k < bs; k++) {
1962           for (l = 0; l < bs; l++) {
1963             if (PetscRealPart(*aa++) != 0.) continue;
1964             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1965           }
1966         }
1967       }
1968     }
1969     color = PETSC_DRAW_RED;
1970     for (i = 0, row = 0; i < mbs; i++, row += bs) {
1971       for (j = a->i[i]; j < a->i[i + 1]; j++) {
1972         y_l = A->rmap->N - row - 1.0;
1973         y_r = y_l + 1.0;
1974         x_l = a->j[j] * bs;
1975         x_r = x_l + 1.0;
1976         aa  = a->a + j * bs2;
1977         for (k = 0; k < bs; k++) {
1978           for (l = 0; l < bs; l++) {
1979             if (PetscRealPart(*aa++) <= 0.) continue;
1980             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
1981           }
1982         }
1983       }
1984     }
1985     PetscDrawCollectiveEnd(draw);
1986   } else {
1987     /* use contour shading to indicate magnitude of values */
1988     /* first determine max of all nonzero values */
1989     PetscReal minv = 0.0, maxv = 0.0;
1990     PetscDraw popup;
1991 
1992     for (i = 0; i < a->nz * a->bs2; i++) {
1993       if (PetscAbsScalar(a->a[i]) > maxv) maxv = PetscAbsScalar(a->a[i]);
1994     }
1995     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1996     PetscCall(PetscDrawGetPopup(draw, &popup));
1997     PetscCall(PetscDrawScalePopup(popup, 0.0, maxv));
1998 
1999     PetscDrawCollectiveBegin(draw);
2000     for (i = 0, row = 0; i < mbs; i++, row += bs) {
2001       for (j = a->i[i]; j < a->i[i + 1]; j++) {
2002         y_l = A->rmap->N - row - 1.0;
2003         y_r = y_l + 1.0;
2004         x_l = a->j[j] * bs;
2005         x_r = x_l + 1.0;
2006         aa  = a->a + j * bs2;
2007         for (k = 0; k < bs; k++) {
2008           for (l = 0; l < bs; l++) {
2009             MatScalar v = *aa++;
2010             color       = PetscDrawRealToColor(PetscAbsScalar(v), minv, maxv);
2011             PetscCall(PetscDrawRectangle(draw, x_l + k, y_l - l, x_r + k, y_r - l, color, color, color, color));
2012           }
2013         }
2014       }
2015     }
2016     PetscDrawCollectiveEnd(draw);
2017   }
2018   PetscFunctionReturn(0);
2019 }
2020 
2021 static PetscErrorCode MatView_SeqBAIJ_Draw(Mat A, PetscViewer viewer)
2022 {
2023   PetscReal xl, yl, xr, yr, w, h;
2024   PetscDraw draw;
2025   PetscBool isnull;
2026 
2027   PetscFunctionBegin;
2028   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
2029   PetscCall(PetscDrawIsNull(draw, &isnull));
2030   if (isnull) PetscFunctionReturn(0);
2031 
2032   xr = A->cmap->n;
2033   yr = A->rmap->N;
2034   h  = yr / 10.0;
2035   w  = xr / 10.0;
2036   xr += w;
2037   yr += h;
2038   xl = -w;
2039   yl = -h;
2040   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
2041   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
2042   PetscCall(PetscDrawZoom(draw, MatView_SeqBAIJ_Draw_Zoom, A));
2043   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
2044   PetscCall(PetscDrawSave(draw));
2045   PetscFunctionReturn(0);
2046 }
2047 
2048 PetscErrorCode MatView_SeqBAIJ(Mat A, PetscViewer viewer)
2049 {
2050   PetscBool iascii, isbinary, isdraw;
2051 
2052   PetscFunctionBegin;
2053   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
2054   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
2055   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
2056   if (iascii) {
2057     PetscCall(MatView_SeqBAIJ_ASCII(A, viewer));
2058   } else if (isbinary) {
2059     PetscCall(MatView_SeqBAIJ_Binary(A, viewer));
2060   } else if (isdraw) {
2061     PetscCall(MatView_SeqBAIJ_Draw(A, viewer));
2062   } else {
2063     Mat B;
2064     PetscCall(MatConvert(A, MATSEQAIJ, MAT_INITIAL_MATRIX, &B));
2065     PetscCall(MatView(B, viewer));
2066     PetscCall(MatDestroy(&B));
2067   }
2068   PetscFunctionReturn(0);
2069 }
2070 
2071 PetscErrorCode MatGetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
2072 {
2073   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2074   PetscInt    *rp, k, low, high, t, row, nrow, i, col, l, *aj = a->j;
2075   PetscInt    *ai = a->i, *ailen = a->ilen;
2076   PetscInt     brow, bcol, ridx, cidx, bs = A->rmap->bs, bs2 = a->bs2;
2077   MatScalar   *ap, *aa = a->a;
2078 
2079   PetscFunctionBegin;
2080   for (k = 0; k < m; k++) { /* loop over rows */
2081     row  = im[k];
2082     brow = row / bs;
2083     if (row < 0) {
2084       v += n;
2085       continue;
2086     } /* negative row */
2087     PetscCheck(row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " too large", row);
2088     rp   = aj ? aj + ai[brow] : NULL;       /* mustn't add to NULL, that is UB */
2089     ap   = aa ? aa + bs2 * ai[brow] : NULL; /* mustn't add to NULL, that is UB */
2090     nrow = ailen[brow];
2091     for (l = 0; l < n; l++) { /* loop over columns */
2092       if (in[l] < 0) {
2093         v++;
2094         continue;
2095       } /* negative column */
2096       PetscCheck(in[l] < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column %" PetscInt_FMT " too large", in[l]);
2097       col  = in[l];
2098       bcol = col / bs;
2099       cidx = col % bs;
2100       ridx = row % bs;
2101       high = nrow;
2102       low  = 0; /* assume unsorted */
2103       while (high - low > 5) {
2104         t = (low + high) / 2;
2105         if (rp[t] > bcol) high = t;
2106         else low = t;
2107       }
2108       for (i = low; i < high; i++) {
2109         if (rp[i] > bcol) break;
2110         if (rp[i] == bcol) {
2111           *v++ = ap[bs2 * i + bs * cidx + ridx];
2112           goto finished;
2113         }
2114       }
2115       *v++ = 0.0;
2116     finished:;
2117     }
2118   }
2119   PetscFunctionReturn(0);
2120 }
2121 
2122 PetscErrorCode MatSetValuesBlocked_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
2123 {
2124   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2125   PetscInt          *rp, k, low, high, t, ii, jj, row, nrow, i, col, l, rmax, N, lastcol = -1;
2126   PetscInt          *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2127   PetscInt          *aj = a->j, nonew = a->nonew, bs2 = a->bs2, bs = A->rmap->bs, stepval;
2128   PetscBool          roworiented = a->roworiented;
2129   const PetscScalar *value       = v;
2130   MatScalar         *ap = NULL, *aa = a->a, *bap;
2131 
2132   PetscFunctionBegin;
2133   if (roworiented) {
2134     stepval = (n - 1) * bs;
2135   } else {
2136     stepval = (m - 1) * bs;
2137   }
2138   for (k = 0; k < m; k++) { /* loop over added rows */
2139     row = im[k];
2140     if (row < 0) continue;
2141     PetscCheck(row < a->mbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Block row index too large %" PetscInt_FMT " max %" PetscInt_FMT, row, a->mbs - 1);
2142     rp = aj + ai[row];
2143     if (!A->structure_only) ap = aa + bs2 * ai[row];
2144     rmax = imax[row];
2145     nrow = ailen[row];
2146     low  = 0;
2147     high = nrow;
2148     for (l = 0; l < n; l++) { /* loop over added columns */
2149       if (in[l] < 0) continue;
2150       PetscCheck(in[l] < a->nbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Block column index too large %" PetscInt_FMT " max %" PetscInt_FMT, in[l], a->nbs - 1);
2151       col = in[l];
2152       if (!A->structure_only) {
2153         if (roworiented) {
2154           value = v + (k * (stepval + bs) + l) * bs;
2155         } else {
2156           value = v + (l * (stepval + bs) + k) * bs;
2157         }
2158       }
2159       if (col <= lastcol) low = 0;
2160       else high = nrow;
2161       lastcol = col;
2162       while (high - low > 7) {
2163         t = (low + high) / 2;
2164         if (rp[t] > col) high = t;
2165         else low = t;
2166       }
2167       for (i = low; i < high; i++) {
2168         if (rp[i] > col) break;
2169         if (rp[i] == col) {
2170           if (A->structure_only) goto noinsert2;
2171           bap = ap + bs2 * i;
2172           if (roworiented) {
2173             if (is == ADD_VALUES) {
2174               for (ii = 0; ii < bs; ii++, value += stepval) {
2175                 for (jj = ii; jj < bs2; jj += bs) bap[jj] += *value++;
2176               }
2177             } else {
2178               for (ii = 0; ii < bs; ii++, value += stepval) {
2179                 for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2180               }
2181             }
2182           } else {
2183             if (is == ADD_VALUES) {
2184               for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2185                 for (jj = 0; jj < bs; jj++) bap[jj] += value[jj];
2186                 bap += bs;
2187               }
2188             } else {
2189               for (ii = 0; ii < bs; ii++, value += bs + stepval) {
2190                 for (jj = 0; jj < bs; jj++) bap[jj] = value[jj];
2191                 bap += bs;
2192               }
2193             }
2194           }
2195           goto noinsert2;
2196         }
2197       }
2198       if (nonew == 1) goto noinsert2;
2199       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new blocked index new nonzero block (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
2200       if (A->structure_only) {
2201         MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, row, col, rmax, ai, aj, rp, imax, nonew, MatScalar);
2202       } else {
2203         MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, row, col, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2204       }
2205       N = nrow++ - 1;
2206       high++;
2207       /* shift up all the later entries in this row */
2208       PetscCall(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
2209       rp[i] = col;
2210       if (!A->structure_only) {
2211         PetscCall(PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1)));
2212         bap = ap + bs2 * i;
2213         if (roworiented) {
2214           for (ii = 0; ii < bs; ii++, value += stepval) {
2215             for (jj = ii; jj < bs2; jj += bs) bap[jj] = *value++;
2216           }
2217         } else {
2218           for (ii = 0; ii < bs; ii++, value += stepval) {
2219             for (jj = 0; jj < bs; jj++) *bap++ = *value++;
2220           }
2221         }
2222       }
2223     noinsert2:;
2224       low = i;
2225     }
2226     ailen[row] = nrow;
2227   }
2228   PetscFunctionReturn(0);
2229 }
2230 
2231 PetscErrorCode MatAssemblyEnd_SeqBAIJ(Mat A, MatAssemblyType mode)
2232 {
2233   Mat_SeqBAIJ *a      = (Mat_SeqBAIJ *)A->data;
2234   PetscInt     fshift = 0, i, *ai = a->i, *aj = a->j, *imax = a->imax;
2235   PetscInt     m = A->rmap->N, *ip, N, *ailen = a->ilen;
2236   PetscInt     mbs = a->mbs, bs2 = a->bs2, rmax = 0;
2237   MatScalar   *aa    = a->a, *ap;
2238   PetscReal    ratio = 0.6;
2239 
2240   PetscFunctionBegin;
2241   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
2242 
2243   if (m) rmax = ailen[0];
2244   for (i = 1; i < mbs; i++) {
2245     /* move each row back by the amount of empty slots (fshift) before it*/
2246     fshift += imax[i - 1] - ailen[i - 1];
2247     rmax = PetscMax(rmax, ailen[i]);
2248     if (fshift) {
2249       ip = aj + ai[i];
2250       ap = aa + bs2 * ai[i];
2251       N  = ailen[i];
2252       PetscCall(PetscArraymove(ip - fshift, ip, N));
2253       if (!A->structure_only) PetscCall(PetscArraymove(ap - bs2 * fshift, ap, bs2 * N));
2254     }
2255     ai[i] = ai[i - 1] + ailen[i - 1];
2256   }
2257   if (mbs) {
2258     fshift += imax[mbs - 1] - ailen[mbs - 1];
2259     ai[mbs] = ai[mbs - 1] + ailen[mbs - 1];
2260   }
2261 
2262   /* reset ilen and imax for each row */
2263   a->nonzerorowcnt = 0;
2264   if (A->structure_only) {
2265     PetscCall(PetscFree2(a->imax, a->ilen));
2266   } else { /* !A->structure_only */
2267     for (i = 0; i < mbs; i++) {
2268       ailen[i] = imax[i] = ai[i + 1] - ai[i];
2269       a->nonzerorowcnt += ((ai[i + 1] - ai[i]) > 0);
2270     }
2271   }
2272   a->nz = ai[mbs];
2273 
2274   /* diagonals may have moved, so kill the diagonal pointers */
2275   a->idiagvalid = PETSC_FALSE;
2276   if (fshift && a->diag) {
2277     PetscCall(PetscFree(a->diag));
2278     a->diag = NULL;
2279   }
2280   if (fshift) PetscCheck(a->nounused != -1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unused space detected in matrix: %" PetscInt_FMT " X %" PetscInt_FMT " block size %" PetscInt_FMT ", %" PetscInt_FMT " unneeded", m, A->cmap->n, A->rmap->bs, fshift * bs2);
2281   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT ", block size %" PetscInt_FMT "; storage space: %" PetscInt_FMT " unneeded, %" PetscInt_FMT " used\n", m, A->cmap->n, A->rmap->bs, fshift * bs2, a->nz * bs2));
2282   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues is %" PetscInt_FMT "\n", a->reallocs));
2283   PetscCall(PetscInfo(A, "Most nonzeros blocks in any row is %" PetscInt_FMT "\n", rmax));
2284 
2285   A->info.mallocs += a->reallocs;
2286   a->reallocs         = 0;
2287   A->info.nz_unneeded = (PetscReal)fshift * bs2;
2288   a->rmax             = rmax;
2289 
2290   if (!A->structure_only) PetscCall(MatCheckCompressedRow(A, a->nonzerorowcnt, &a->compressedrow, a->i, mbs, ratio));
2291   PetscFunctionReturn(0);
2292 }
2293 
2294 /*
2295    This function returns an array of flags which indicate the locations of contiguous
2296    blocks that should be zeroed. for eg: if bs = 3  and is = [0,1,2,3,5,6,7,8,9]
2297    then the resulting sizes = [3,1,1,3,1] corresponding to sets [(0,1,2),(3),(5),(6,7,8),(9)]
2298    Assume: sizes should be long enough to hold all the values.
2299 */
2300 static PetscErrorCode MatZeroRows_SeqBAIJ_Check_Blocks(PetscInt idx[], PetscInt n, PetscInt bs, PetscInt sizes[], PetscInt *bs_max)
2301 {
2302   PetscInt  i, j, k, row;
2303   PetscBool flg;
2304 
2305   PetscFunctionBegin;
2306   for (i = 0, j = 0; i < n; j++) {
2307     row = idx[i];
2308     if (row % bs != 0) { /* Not the beginning of a block */
2309       sizes[j] = 1;
2310       i++;
2311     } else if (i + bs > n) { /* complete block doesn't exist (at idx end) */
2312       sizes[j] = 1;          /* Also makes sure at least 'bs' values exist for next else */
2313       i++;
2314     } else { /* Beginning of the block, so check if the complete block exists */
2315       flg = PETSC_TRUE;
2316       for (k = 1; k < bs; k++) {
2317         if (row + k != idx[i + k]) { /* break in the block */
2318           flg = PETSC_FALSE;
2319           break;
2320         }
2321       }
2322       if (flg) { /* No break in the bs */
2323         sizes[j] = bs;
2324         i += bs;
2325       } else {
2326         sizes[j] = 1;
2327         i++;
2328       }
2329     }
2330   }
2331   *bs_max = j;
2332   PetscFunctionReturn(0);
2333 }
2334 
2335 PetscErrorCode MatZeroRows_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b)
2336 {
2337   Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
2338   PetscInt           i, j, k, count, *rows;
2339   PetscInt           bs = A->rmap->bs, bs2 = baij->bs2, *sizes, row, bs_max;
2340   PetscScalar        zero = 0.0;
2341   MatScalar         *aa;
2342   const PetscScalar *xx;
2343   PetscScalar       *bb;
2344 
2345   PetscFunctionBegin;
2346   /* fix right hand side if needed */
2347   if (x && b) {
2348     PetscCall(VecGetArrayRead(x, &xx));
2349     PetscCall(VecGetArray(b, &bb));
2350     for (i = 0; i < is_n; i++) bb[is_idx[i]] = diag * xx[is_idx[i]];
2351     PetscCall(VecRestoreArrayRead(x, &xx));
2352     PetscCall(VecRestoreArray(b, &bb));
2353   }
2354 
2355   /* Make a copy of the IS and  sort it */
2356   /* allocate memory for rows,sizes */
2357   PetscCall(PetscMalloc2(is_n, &rows, 2 * is_n, &sizes));
2358 
2359   /* copy IS values to rows, and sort them */
2360   for (i = 0; i < is_n; i++) rows[i] = is_idx[i];
2361   PetscCall(PetscSortInt(is_n, rows));
2362 
2363   if (baij->keepnonzeropattern) {
2364     for (i = 0; i < is_n; i++) sizes[i] = 1;
2365     bs_max = is_n;
2366   } else {
2367     PetscCall(MatZeroRows_SeqBAIJ_Check_Blocks(rows, is_n, bs, sizes, &bs_max));
2368     A->nonzerostate++;
2369   }
2370 
2371   for (i = 0, j = 0; i < bs_max; j += sizes[i], i++) {
2372     row = rows[j];
2373     PetscCheck(row >= 0 && row <= A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "row %" PetscInt_FMT " out of range", row);
2374     count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2375     aa    = ((MatScalar *)(baij->a)) + baij->i[row / bs] * bs2 + (row % bs);
2376     if (sizes[i] == bs && !baij->keepnonzeropattern) {
2377       if (diag != (PetscScalar)0.0) {
2378         if (baij->ilen[row / bs] > 0) {
2379           baij->ilen[row / bs]       = 1;
2380           baij->j[baij->i[row / bs]] = row / bs;
2381 
2382           PetscCall(PetscArrayzero(aa, count * bs));
2383         }
2384         /* Now insert all the diagonal values for this bs */
2385         for (k = 0; k < bs; k++) PetscCall((*A->ops->setvalues)(A, 1, rows + j + k, 1, rows + j + k, &diag, INSERT_VALUES));
2386       } else { /* (diag == 0.0) */
2387         baij->ilen[row / bs] = 0;
2388       }      /* end (diag == 0.0) */
2389     } else { /* (sizes[i] != bs) */
2390       PetscAssert(sizes[i] == 1, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Internal Error. Value should be 1");
2391       for (k = 0; k < count; k++) {
2392         aa[0] = zero;
2393         aa += bs;
2394       }
2395       if (diag != (PetscScalar)0.0) PetscCall((*A->ops->setvalues)(A, 1, rows + j, 1, rows + j, &diag, INSERT_VALUES));
2396     }
2397   }
2398 
2399   PetscCall(PetscFree2(rows, sizes));
2400   PetscCall(MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY));
2401   PetscFunctionReturn(0);
2402 }
2403 
2404 PetscErrorCode MatZeroRowsColumns_SeqBAIJ(Mat A, PetscInt is_n, const PetscInt is_idx[], PetscScalar diag, Vec x, Vec b)
2405 {
2406   Mat_SeqBAIJ       *baij = (Mat_SeqBAIJ *)A->data;
2407   PetscInt           i, j, k, count;
2408   PetscInt           bs = A->rmap->bs, bs2 = baij->bs2, row, col;
2409   PetscScalar        zero = 0.0;
2410   MatScalar         *aa;
2411   const PetscScalar *xx;
2412   PetscScalar       *bb;
2413   PetscBool         *zeroed, vecs = PETSC_FALSE;
2414 
2415   PetscFunctionBegin;
2416   /* fix right hand side if needed */
2417   if (x && b) {
2418     PetscCall(VecGetArrayRead(x, &xx));
2419     PetscCall(VecGetArray(b, &bb));
2420     vecs = PETSC_TRUE;
2421   }
2422 
2423   /* zero the columns */
2424   PetscCall(PetscCalloc1(A->rmap->n, &zeroed));
2425   for (i = 0; i < is_n; i++) {
2426     PetscCheck(is_idx[i] >= 0 && is_idx[i] < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "row %" PetscInt_FMT " out of range", is_idx[i]);
2427     zeroed[is_idx[i]] = PETSC_TRUE;
2428   }
2429   for (i = 0; i < A->rmap->N; i++) {
2430     if (!zeroed[i]) {
2431       row = i / bs;
2432       for (j = baij->i[row]; j < baij->i[row + 1]; j++) {
2433         for (k = 0; k < bs; k++) {
2434           col = bs * baij->j[j] + k;
2435           if (zeroed[col]) {
2436             aa = ((MatScalar *)(baij->a)) + j * bs2 + (i % bs) + bs * k;
2437             if (vecs) bb[i] -= aa[0] * xx[col];
2438             aa[0] = 0.0;
2439           }
2440         }
2441       }
2442     } else if (vecs) bb[i] = diag * xx[i];
2443   }
2444   PetscCall(PetscFree(zeroed));
2445   if (vecs) {
2446     PetscCall(VecRestoreArrayRead(x, &xx));
2447     PetscCall(VecRestoreArray(b, &bb));
2448   }
2449 
2450   /* zero the rows */
2451   for (i = 0; i < is_n; i++) {
2452     row   = is_idx[i];
2453     count = (baij->i[row / bs + 1] - baij->i[row / bs]) * bs;
2454     aa    = ((MatScalar *)(baij->a)) + baij->i[row / bs] * bs2 + (row % bs);
2455     for (k = 0; k < count; k++) {
2456       aa[0] = zero;
2457       aa += bs;
2458     }
2459     if (diag != (PetscScalar)0.0) PetscUseTypeMethod(A, setvalues, 1, &row, 1, &row, &diag, INSERT_VALUES);
2460   }
2461   PetscCall(MatAssemblyEnd_SeqBAIJ(A, MAT_FINAL_ASSEMBLY));
2462   PetscFunctionReturn(0);
2463 }
2464 
2465 PetscErrorCode MatSetValues_SeqBAIJ(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
2466 {
2467   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2468   PetscInt    *rp, k, low, high, t, ii, row, nrow, i, col, l, rmax, N, lastcol = -1;
2469   PetscInt    *imax = a->imax, *ai = a->i, *ailen = a->ilen;
2470   PetscInt    *aj = a->j, nonew = a->nonew, bs = A->rmap->bs, brow, bcol;
2471   PetscInt     ridx, cidx, bs2                 = a->bs2;
2472   PetscBool    roworiented = a->roworiented;
2473   MatScalar   *ap = NULL, value = 0.0, *aa = a->a, *bap;
2474 
2475   PetscFunctionBegin;
2476   for (k = 0; k < m; k++) { /* loop over added rows */
2477     row  = im[k];
2478     brow = row / bs;
2479     if (row < 0) continue;
2480     PetscCheck(row < A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->N - 1);
2481     rp = aj + ai[brow];
2482     if (!A->structure_only) ap = aa + bs2 * ai[brow];
2483     rmax = imax[brow];
2484     nrow = ailen[brow];
2485     low  = 0;
2486     high = nrow;
2487     for (l = 0; l < n; l++) { /* loop over added columns */
2488       if (in[l] < 0) continue;
2489       PetscCheck(in[l] < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: col %" PetscInt_FMT " max %" PetscInt_FMT, in[l], A->cmap->n - 1);
2490       col  = in[l];
2491       bcol = col / bs;
2492       ridx = row % bs;
2493       cidx = col % bs;
2494       if (!A->structure_only) {
2495         if (roworiented) {
2496           value = v[l + k * n];
2497         } else {
2498           value = v[k + l * m];
2499         }
2500       }
2501       if (col <= lastcol) low = 0;
2502       else high = nrow;
2503       lastcol = col;
2504       while (high - low > 7) {
2505         t = (low + high) / 2;
2506         if (rp[t] > bcol) high = t;
2507         else low = t;
2508       }
2509       for (i = low; i < high; i++) {
2510         if (rp[i] > bcol) break;
2511         if (rp[i] == bcol) {
2512           bap = ap + bs2 * i + bs * cidx + ridx;
2513           if (!A->structure_only) {
2514             if (is == ADD_VALUES) *bap += value;
2515             else *bap = value;
2516           }
2517           goto noinsert1;
2518         }
2519       }
2520       if (nonew == 1) goto noinsert1;
2521       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
2522       if (A->structure_only) {
2523         MatSeqXAIJReallocateAIJ_structure_only(A, a->mbs, bs2, nrow, brow, bcol, rmax, ai, aj, rp, imax, nonew, MatScalar);
2524       } else {
2525         MatSeqXAIJReallocateAIJ(A, a->mbs, bs2, nrow, brow, bcol, rmax, aa, ai, aj, rp, ap, imax, nonew, MatScalar);
2526       }
2527       N = nrow++ - 1;
2528       high++;
2529       /* shift up all the later entries in this row */
2530       PetscCall(PetscArraymove(rp + i + 1, rp + i, N - i + 1));
2531       rp[i] = bcol;
2532       if (!A->structure_only) {
2533         PetscCall(PetscArraymove(ap + bs2 * (i + 1), ap + bs2 * i, bs2 * (N - i + 1)));
2534         PetscCall(PetscArrayzero(ap + bs2 * i, bs2));
2535         ap[bs2 * i + bs * cidx + ridx] = value;
2536       }
2537       a->nz++;
2538       A->nonzerostate++;
2539     noinsert1:;
2540       low = i;
2541     }
2542     ailen[brow] = nrow;
2543   }
2544   PetscFunctionReturn(0);
2545 }
2546 
2547 PetscErrorCode MatILUFactor_SeqBAIJ(Mat inA, IS row, IS col, const MatFactorInfo *info)
2548 {
2549   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
2550   Mat          outA;
2551   PetscBool    row_identity, col_identity;
2552 
2553   PetscFunctionBegin;
2554   PetscCheck(info->levels == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Only levels = 0 supported for in-place ILU");
2555   PetscCall(ISIdentity(row, &row_identity));
2556   PetscCall(ISIdentity(col, &col_identity));
2557   PetscCheck(row_identity && col_identity, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column permutations must be identity for in-place ILU");
2558 
2559   outA            = inA;
2560   inA->factortype = MAT_FACTOR_LU;
2561   PetscCall(PetscFree(inA->solvertype));
2562   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &inA->solvertype));
2563 
2564   PetscCall(MatMarkDiagonal_SeqBAIJ(inA));
2565 
2566   PetscCall(PetscObjectReference((PetscObject)row));
2567   PetscCall(ISDestroy(&a->row));
2568   a->row = row;
2569   PetscCall(PetscObjectReference((PetscObject)col));
2570   PetscCall(ISDestroy(&a->col));
2571   a->col = col;
2572 
2573   /* Create the invert permutation so that it can be used in MatLUFactorNumeric() */
2574   PetscCall(ISDestroy(&a->icol));
2575   PetscCall(ISInvertPermutation(col, PETSC_DECIDE, &a->icol));
2576 
2577   PetscCall(MatSeqBAIJSetNumericFactorization_inplace(inA, (PetscBool)(row_identity && col_identity)));
2578   if (!a->solve_work) { PetscCall(PetscMalloc1(inA->rmap->N + inA->rmap->bs, &a->solve_work)); }
2579   PetscCall(MatLUFactorNumeric(outA, inA, info));
2580   PetscFunctionReturn(0);
2581 }
2582 
2583 PetscErrorCode MatSeqBAIJSetColumnIndices_SeqBAIJ(Mat mat, PetscInt *indices)
2584 {
2585   Mat_SeqBAIJ *baij = (Mat_SeqBAIJ *)mat->data;
2586   PetscInt     i, nz, mbs;
2587 
2588   PetscFunctionBegin;
2589   nz  = baij->maxnz;
2590   mbs = baij->mbs;
2591   for (i = 0; i < nz; i++) baij->j[i] = indices[i];
2592   baij->nz = nz;
2593   for (i = 0; i < mbs; i++) baij->ilen[i] = baij->imax[i];
2594   PetscFunctionReturn(0);
2595 }
2596 
2597 /*@
2598     MatSeqBAIJSetColumnIndices - Set the column indices for all the rows in the matrix.
2599 
2600   Input Parameters:
2601 +  mat - the `MATSEQBAIJ` matrix
2602 -  indices - the column indices
2603 
2604   Level: advanced
2605 
2606   Notes:
2607     This can be called if you have precomputed the nonzero structure of the
2608   matrix and want to provide it to the matrix object to improve the performance
2609   of the `MatSetValues()` operation.
2610 
2611     You MUST have set the correct numbers of nonzeros per row in the call to
2612   `MatCreateSeqBAIJ()`, and the columns indices MUST be sorted.
2613 
2614     MUST be called before any calls to `MatSetValues()`
2615 
2616 .seealso: `MATSEQBAIJ`, `MatSetValues()`
2617 @*/
2618 PetscErrorCode MatSeqBAIJSetColumnIndices(Mat mat, PetscInt *indices)
2619 {
2620   PetscFunctionBegin;
2621   PetscValidHeaderSpecific(mat, MAT_CLASSID, 1);
2622   PetscValidIntPointer(indices, 2);
2623   PetscUseMethod(mat, "MatSeqBAIJSetColumnIndices_C", (Mat, PetscInt *), (mat, indices));
2624   PetscFunctionReturn(0);
2625 }
2626 
2627 PetscErrorCode MatGetRowMaxAbs_SeqBAIJ(Mat A, Vec v, PetscInt idx[])
2628 {
2629   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2630   PetscInt     i, j, n, row, bs, *ai, *aj, mbs;
2631   PetscReal    atmp;
2632   PetscScalar *x, zero = 0.0;
2633   MatScalar   *aa;
2634   PetscInt     ncols, brow, krow, kcol;
2635 
2636   PetscFunctionBegin;
2637   /* why is this not a macro???????????????????????????????????????????????????????????????? */
2638   PetscCheck(!A->factortype, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
2639   bs  = A->rmap->bs;
2640   aa  = a->a;
2641   ai  = a->i;
2642   aj  = a->j;
2643   mbs = a->mbs;
2644 
2645   PetscCall(VecSet(v, zero));
2646   PetscCall(VecGetArray(v, &x));
2647   PetscCall(VecGetLocalSize(v, &n));
2648   PetscCheck(n == A->rmap->N, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
2649   for (i = 0; i < mbs; i++) {
2650     ncols = ai[1] - ai[0];
2651     ai++;
2652     brow = bs * i;
2653     for (j = 0; j < ncols; j++) {
2654       for (kcol = 0; kcol < bs; kcol++) {
2655         for (krow = 0; krow < bs; krow++) {
2656           atmp = PetscAbsScalar(*aa);
2657           aa++;
2658           row = brow + krow; /* row index */
2659           if (PetscAbsScalar(x[row]) < atmp) {
2660             x[row] = atmp;
2661             if (idx) idx[row] = bs * (*aj) + kcol;
2662           }
2663         }
2664       }
2665       aj++;
2666     }
2667   }
2668   PetscCall(VecRestoreArray(v, &x));
2669   PetscFunctionReturn(0);
2670 }
2671 
2672 PetscErrorCode MatCopy_SeqBAIJ(Mat A, Mat B, MatStructure str)
2673 {
2674   PetscFunctionBegin;
2675   /* If the two matrices have the same copy implementation, use fast copy. */
2676   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
2677     Mat_SeqBAIJ *a    = (Mat_SeqBAIJ *)A->data;
2678     Mat_SeqBAIJ *b    = (Mat_SeqBAIJ *)B->data;
2679     PetscInt     ambs = a->mbs, bmbs = b->mbs, abs = A->rmap->bs, bbs = B->rmap->bs, bs2 = abs * abs;
2680 
2681     PetscCheck(a->i[ambs] == b->i[bmbs], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzero blocks in matrices A %" PetscInt_FMT " and B %" PetscInt_FMT " are different", a->i[ambs], b->i[bmbs]);
2682     PetscCheck(abs == bbs, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Block size A %" PetscInt_FMT " and B %" PetscInt_FMT " are different", abs, bbs);
2683     PetscCall(PetscArraycpy(b->a, a->a, bs2 * a->i[ambs]));
2684     PetscCall(PetscObjectStateIncrease((PetscObject)B));
2685   } else {
2686     PetscCall(MatCopy_Basic(A, B, str));
2687   }
2688   PetscFunctionReturn(0);
2689 }
2690 
2691 PetscErrorCode MatSetUp_SeqBAIJ(Mat A)
2692 {
2693   PetscFunctionBegin;
2694   PetscCall(MatSeqBAIJSetPreallocation(A, A->rmap->bs, PETSC_DEFAULT, NULL));
2695   PetscFunctionReturn(0);
2696 }
2697 
2698 static PetscErrorCode MatSeqBAIJGetArray_SeqBAIJ(Mat A, PetscScalar *array[])
2699 {
2700   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2701 
2702   PetscFunctionBegin;
2703   *array = a->a;
2704   PetscFunctionReturn(0);
2705 }
2706 
2707 static PetscErrorCode MatSeqBAIJRestoreArray_SeqBAIJ(Mat A, PetscScalar *array[])
2708 {
2709   PetscFunctionBegin;
2710   *array = NULL;
2711   PetscFunctionReturn(0);
2712 }
2713 
2714 PetscErrorCode MatAXPYGetPreallocation_SeqBAIJ(Mat Y, Mat X, PetscInt *nnz)
2715 {
2716   PetscInt     bs = Y->rmap->bs, mbs = Y->rmap->N / bs;
2717   Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data;
2718   Mat_SeqBAIJ *y = (Mat_SeqBAIJ *)Y->data;
2719 
2720   PetscFunctionBegin;
2721   /* Set the number of nonzeros in the new matrix */
2722   PetscCall(MatAXPYGetPreallocation_SeqX_private(mbs, x->i, x->j, y->i, y->j, nnz));
2723   PetscFunctionReturn(0);
2724 }
2725 
2726 PetscErrorCode MatAXPY_SeqBAIJ(Mat Y, PetscScalar a, Mat X, MatStructure str)
2727 {
2728   Mat_SeqBAIJ *x = (Mat_SeqBAIJ *)X->data, *y = (Mat_SeqBAIJ *)Y->data;
2729   PetscInt     bs = Y->rmap->bs, bs2 = bs * bs;
2730   PetscBLASInt one = 1;
2731 
2732   PetscFunctionBegin;
2733   if (str == UNKNOWN_NONZERO_PATTERN || (PetscDefined(USE_DEBUG) && str == SAME_NONZERO_PATTERN)) {
2734     PetscBool e = x->nz == y->nz && x->mbs == y->mbs && bs == X->rmap->bs ? PETSC_TRUE : PETSC_FALSE;
2735     if (e) {
2736       PetscCall(PetscArraycmp(x->i, y->i, x->mbs + 1, &e));
2737       if (e) {
2738         PetscCall(PetscArraycmp(x->j, y->j, x->i[x->mbs], &e));
2739         if (e) str = SAME_NONZERO_PATTERN;
2740       }
2741     }
2742     if (!e) PetscCheck(str != SAME_NONZERO_PATTERN, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "MatStructure is not SAME_NONZERO_PATTERN");
2743   }
2744   if (str == SAME_NONZERO_PATTERN) {
2745     PetscScalar  alpha = a;
2746     PetscBLASInt bnz;
2747     PetscCall(PetscBLASIntCast(x->nz * bs2, &bnz));
2748     PetscCallBLAS("BLASaxpy", BLASaxpy_(&bnz, &alpha, x->a, &one, y->a, &one));
2749     PetscCall(PetscObjectStateIncrease((PetscObject)Y));
2750   } else if (str == SUBSET_NONZERO_PATTERN) { /* nonzeros of X is a subset of Y's */
2751     PetscCall(MatAXPY_Basic(Y, a, X, str));
2752   } else {
2753     Mat       B;
2754     PetscInt *nnz;
2755     PetscCheck(bs == X->rmap->bs, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrices must have same block size");
2756     PetscCall(PetscMalloc1(Y->rmap->N, &nnz));
2757     PetscCall(MatCreate(PetscObjectComm((PetscObject)Y), &B));
2758     PetscCall(PetscObjectSetName((PetscObject)B, ((PetscObject)Y)->name));
2759     PetscCall(MatSetSizes(B, Y->rmap->n, Y->cmap->n, Y->rmap->N, Y->cmap->N));
2760     PetscCall(MatSetBlockSizesFromMats(B, Y, Y));
2761     PetscCall(MatSetType(B, (MatType)((PetscObject)Y)->type_name));
2762     PetscCall(MatAXPYGetPreallocation_SeqBAIJ(Y, X, nnz));
2763     PetscCall(MatSeqBAIJSetPreallocation(B, bs, 0, nnz));
2764     PetscCall(MatAXPY_BasicWithPreallocation(B, Y, a, X, str));
2765     PetscCall(MatHeaderMerge(Y, &B));
2766     PetscCall(PetscFree(nnz));
2767   }
2768   PetscFunctionReturn(0);
2769 }
2770 
2771 PETSC_INTERN PetscErrorCode MatConjugate_SeqBAIJ(Mat A)
2772 {
2773 #if defined(PETSC_USE_COMPLEX)
2774   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2775   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2776   MatScalar   *aa = a->a;
2777 
2778   PetscFunctionBegin;
2779   for (i = 0; i < nz; i++) aa[i] = PetscConj(aa[i]);
2780 #else
2781   PetscFunctionBegin;
2782 #endif
2783   PetscFunctionReturn(0);
2784 }
2785 
2786 PetscErrorCode MatRealPart_SeqBAIJ(Mat A)
2787 {
2788   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2789   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2790   MatScalar   *aa = a->a;
2791 
2792   PetscFunctionBegin;
2793   for (i = 0; i < nz; i++) aa[i] = PetscRealPart(aa[i]);
2794   PetscFunctionReturn(0);
2795 }
2796 
2797 PetscErrorCode MatImaginaryPart_SeqBAIJ(Mat A)
2798 {
2799   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2800   PetscInt     i, nz = a->bs2 * a->i[a->mbs];
2801   MatScalar   *aa = a->a;
2802 
2803   PetscFunctionBegin;
2804   for (i = 0; i < nz; i++) aa[i] = PetscImaginaryPart(aa[i]);
2805   PetscFunctionReturn(0);
2806 }
2807 
2808 /*
2809     Code almost identical to MatGetColumnIJ_SeqAIJ() should share common code
2810 */
2811 PetscErrorCode MatGetColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
2812 {
2813   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
2814   PetscInt     bs = A->rmap->bs, i, *collengths, *cia, *cja, n = A->cmap->n / bs, m = A->rmap->n / bs;
2815   PetscInt     nz = a->i[m], row, *jj, mr, col;
2816 
2817   PetscFunctionBegin;
2818   *nn = n;
2819   if (!ia) PetscFunctionReturn(0);
2820   PetscCheck(!symmetric, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not for BAIJ matrices");
2821   PetscCall(PetscCalloc1(n, &collengths));
2822   PetscCall(PetscMalloc1(n + 1, &cia));
2823   PetscCall(PetscMalloc1(nz, &cja));
2824   jj = a->j;
2825   for (i = 0; i < nz; i++) collengths[jj[i]]++;
2826   cia[0] = oshift;
2827   for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2828   PetscCall(PetscArrayzero(collengths, n));
2829   jj = a->j;
2830   for (row = 0; row < m; row++) {
2831     mr = a->i[row + 1] - a->i[row];
2832     for (i = 0; i < mr; i++) {
2833       col = *jj++;
2834 
2835       cja[cia[col] + collengths[col]++ - oshift] = row + oshift;
2836     }
2837   }
2838   PetscCall(PetscFree(collengths));
2839   *ia = cia;
2840   *ja = cja;
2841   PetscFunctionReturn(0);
2842 }
2843 
2844 PetscErrorCode MatRestoreColumnIJ_SeqBAIJ(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
2845 {
2846   PetscFunctionBegin;
2847   if (!ia) PetscFunctionReturn(0);
2848   PetscCall(PetscFree(*ia));
2849   PetscCall(PetscFree(*ja));
2850   PetscFunctionReturn(0);
2851 }
2852 
2853 /*
2854  MatGetColumnIJ_SeqBAIJ_Color() and MatRestoreColumnIJ_SeqBAIJ_Color() are customized from
2855  MatGetColumnIJ_SeqBAIJ() and MatRestoreColumnIJ_SeqBAIJ() by adding an output
2856  spidx[], index of a->a, to be used in MatTransposeColoringCreate() and MatFDColoringCreate()
2857  */
2858 PetscErrorCode MatGetColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *nn, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done)
2859 {
2860   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
2861   PetscInt     i, *collengths, *cia, *cja, n = a->nbs, m = a->mbs;
2862   PetscInt     nz = a->i[m], row, *jj, mr, col;
2863   PetscInt    *cspidx;
2864 
2865   PetscFunctionBegin;
2866   *nn = n;
2867   if (!ia) PetscFunctionReturn(0);
2868 
2869   PetscCall(PetscCalloc1(n, &collengths));
2870   PetscCall(PetscMalloc1(n + 1, &cia));
2871   PetscCall(PetscMalloc1(nz, &cja));
2872   PetscCall(PetscMalloc1(nz, &cspidx));
2873   jj = a->j;
2874   for (i = 0; i < nz; i++) collengths[jj[i]]++;
2875   cia[0] = oshift;
2876   for (i = 0; i < n; i++) cia[i + 1] = cia[i] + collengths[i];
2877   PetscCall(PetscArrayzero(collengths, n));
2878   jj = a->j;
2879   for (row = 0; row < m; row++) {
2880     mr = a->i[row + 1] - a->i[row];
2881     for (i = 0; i < mr; i++) {
2882       col                                         = *jj++;
2883       cspidx[cia[col] + collengths[col] - oshift] = a->i[row] + i; /* index of a->j */
2884       cja[cia[col] + collengths[col]++ - oshift]  = row + oshift;
2885     }
2886   }
2887   PetscCall(PetscFree(collengths));
2888   *ia    = cia;
2889   *ja    = cja;
2890   *spidx = cspidx;
2891   PetscFunctionReturn(0);
2892 }
2893 
2894 PetscErrorCode MatRestoreColumnIJ_SeqBAIJ_Color(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool inodecompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscInt *spidx[], PetscBool *done)
2895 {
2896   PetscFunctionBegin;
2897   PetscCall(MatRestoreColumnIJ_SeqBAIJ(A, oshift, symmetric, inodecompressed, n, ia, ja, done));
2898   PetscCall(PetscFree(*spidx));
2899   PetscFunctionReturn(0);
2900 }
2901 
2902 PetscErrorCode MatShift_SeqBAIJ(Mat Y, PetscScalar a)
2903 {
2904   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)Y->data;
2905 
2906   PetscFunctionBegin;
2907   if (!Y->preallocated || !aij->nz) PetscCall(MatSeqBAIJSetPreallocation(Y, Y->rmap->bs, 1, NULL));
2908   PetscCall(MatShift_Basic(Y, a));
2909   PetscFunctionReturn(0);
2910 }
2911 
2912 /* -------------------------------------------------------------------*/
2913 static struct _MatOps MatOps_Values = {
2914   MatSetValues_SeqBAIJ,
2915   MatGetRow_SeqBAIJ,
2916   MatRestoreRow_SeqBAIJ,
2917   MatMult_SeqBAIJ_N,
2918   /* 4*/ MatMultAdd_SeqBAIJ_N,
2919   MatMultTranspose_SeqBAIJ,
2920   MatMultTransposeAdd_SeqBAIJ,
2921   NULL,
2922   NULL,
2923   NULL,
2924   /* 10*/ NULL,
2925   MatLUFactor_SeqBAIJ,
2926   NULL,
2927   NULL,
2928   MatTranspose_SeqBAIJ,
2929   /* 15*/ MatGetInfo_SeqBAIJ,
2930   MatEqual_SeqBAIJ,
2931   MatGetDiagonal_SeqBAIJ,
2932   MatDiagonalScale_SeqBAIJ,
2933   MatNorm_SeqBAIJ,
2934   /* 20*/ NULL,
2935   MatAssemblyEnd_SeqBAIJ,
2936   MatSetOption_SeqBAIJ,
2937   MatZeroEntries_SeqBAIJ,
2938   /* 24*/ MatZeroRows_SeqBAIJ,
2939   NULL,
2940   NULL,
2941   NULL,
2942   NULL,
2943   /* 29*/ MatSetUp_SeqBAIJ,
2944   NULL,
2945   NULL,
2946   NULL,
2947   NULL,
2948   /* 34*/ MatDuplicate_SeqBAIJ,
2949   NULL,
2950   NULL,
2951   MatILUFactor_SeqBAIJ,
2952   NULL,
2953   /* 39*/ MatAXPY_SeqBAIJ,
2954   MatCreateSubMatrices_SeqBAIJ,
2955   MatIncreaseOverlap_SeqBAIJ,
2956   MatGetValues_SeqBAIJ,
2957   MatCopy_SeqBAIJ,
2958   /* 44*/ NULL,
2959   MatScale_SeqBAIJ,
2960   MatShift_SeqBAIJ,
2961   NULL,
2962   MatZeroRowsColumns_SeqBAIJ,
2963   /* 49*/ NULL,
2964   MatGetRowIJ_SeqBAIJ,
2965   MatRestoreRowIJ_SeqBAIJ,
2966   MatGetColumnIJ_SeqBAIJ,
2967   MatRestoreColumnIJ_SeqBAIJ,
2968   /* 54*/ MatFDColoringCreate_SeqXAIJ,
2969   NULL,
2970   NULL,
2971   NULL,
2972   MatSetValuesBlocked_SeqBAIJ,
2973   /* 59*/ MatCreateSubMatrix_SeqBAIJ,
2974   MatDestroy_SeqBAIJ,
2975   MatView_SeqBAIJ,
2976   NULL,
2977   NULL,
2978   /* 64*/ NULL,
2979   NULL,
2980   NULL,
2981   NULL,
2982   NULL,
2983   /* 69*/ MatGetRowMaxAbs_SeqBAIJ,
2984   NULL,
2985   MatConvert_Basic,
2986   NULL,
2987   NULL,
2988   /* 74*/ NULL,
2989   MatFDColoringApply_BAIJ,
2990   NULL,
2991   NULL,
2992   NULL,
2993   /* 79*/ NULL,
2994   NULL,
2995   NULL,
2996   NULL,
2997   MatLoad_SeqBAIJ,
2998   /* 84*/ NULL,
2999   NULL,
3000   NULL,
3001   NULL,
3002   NULL,
3003   /* 89*/ NULL,
3004   NULL,
3005   NULL,
3006   NULL,
3007   NULL,
3008   /* 94*/ NULL,
3009   NULL,
3010   NULL,
3011   NULL,
3012   NULL,
3013   /* 99*/ NULL,
3014   NULL,
3015   NULL,
3016   MatConjugate_SeqBAIJ,
3017   NULL,
3018   /*104*/ NULL,
3019   MatRealPart_SeqBAIJ,
3020   MatImaginaryPart_SeqBAIJ,
3021   NULL,
3022   NULL,
3023   /*109*/ NULL,
3024   NULL,
3025   NULL,
3026   NULL,
3027   MatMissingDiagonal_SeqBAIJ,
3028   /*114*/ NULL,
3029   NULL,
3030   NULL,
3031   NULL,
3032   NULL,
3033   /*119*/ NULL,
3034   NULL,
3035   MatMultHermitianTranspose_SeqBAIJ,
3036   MatMultHermitianTransposeAdd_SeqBAIJ,
3037   NULL,
3038   /*124*/ NULL,
3039   MatGetColumnReductions_SeqBAIJ,
3040   MatInvertBlockDiagonal_SeqBAIJ,
3041   NULL,
3042   NULL,
3043   /*129*/ NULL,
3044   NULL,
3045   NULL,
3046   NULL,
3047   NULL,
3048   /*134*/ NULL,
3049   NULL,
3050   NULL,
3051   NULL,
3052   NULL,
3053   /*139*/ MatSetBlockSizes_Default,
3054   NULL,
3055   NULL,
3056   MatFDColoringSetUp_SeqXAIJ,
3057   NULL,
3058   /*144*/ MatCreateMPIMatConcatenateSeqMat_SeqBAIJ,
3059   MatDestroySubMatrices_SeqBAIJ,
3060   NULL,
3061   NULL,
3062   NULL,
3063   NULL,
3064   /*150*/ NULL,
3065 };
3066 
3067 PetscErrorCode MatStoreValues_SeqBAIJ(Mat mat)
3068 {
3069   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3070   PetscInt     nz  = aij->i[aij->mbs] * aij->bs2;
3071 
3072   PetscFunctionBegin;
3073   PetscCheck(aij->nonew == 1, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
3074 
3075   /* allocate space for values if not already there */
3076   if (!aij->saved_values) { PetscCall(PetscMalloc1(nz + 1, &aij->saved_values)); }
3077 
3078   /* copy values over */
3079   PetscCall(PetscArraycpy(aij->saved_values, aij->a, nz));
3080   PetscFunctionReturn(0);
3081 }
3082 
3083 PetscErrorCode MatRetrieveValues_SeqBAIJ(Mat mat)
3084 {
3085   Mat_SeqBAIJ *aij = (Mat_SeqBAIJ *)mat->data;
3086   PetscInt     nz  = aij->i[aij->mbs] * aij->bs2;
3087 
3088   PetscFunctionBegin;
3089   PetscCheck(aij->nonew == 1, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
3090   PetscCheck(aij->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
3091 
3092   /* copy values over */
3093   PetscCall(PetscArraycpy(aij->a, aij->saved_values, nz));
3094   PetscFunctionReturn(0);
3095 }
3096 
3097 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqAIJ(Mat, MatType, MatReuse, Mat *);
3098 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqSBAIJ(Mat, MatType, MatReuse, Mat *);
3099 
3100 PetscErrorCode MatSeqBAIJSetPreallocation_SeqBAIJ(Mat B, PetscInt bs, PetscInt nz, PetscInt *nnz)
3101 {
3102   Mat_SeqBAIJ *b;
3103   PetscInt     i, mbs, nbs, bs2;
3104   PetscBool    flg = PETSC_FALSE, skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
3105 
3106   PetscFunctionBegin;
3107   if (nz >= 0 || nnz) realalloc = PETSC_TRUE;
3108   if (nz == MAT_SKIP_ALLOCATION) {
3109     skipallocation = PETSC_TRUE;
3110     nz             = 0;
3111   }
3112 
3113   PetscCall(MatSetBlockSize(B, PetscAbs(bs)));
3114   PetscCall(PetscLayoutSetUp(B->rmap));
3115   PetscCall(PetscLayoutSetUp(B->cmap));
3116   PetscCall(PetscLayoutGetBlockSize(B->rmap, &bs));
3117 
3118   B->preallocated = PETSC_TRUE;
3119 
3120   mbs = B->rmap->n / bs;
3121   nbs = B->cmap->n / bs;
3122   bs2 = bs * bs;
3123 
3124   PetscCheck(mbs * bs == B->rmap->n && nbs * bs == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows %" PetscInt_FMT ", cols %" PetscInt_FMT " must be divisible by blocksize %" PetscInt_FMT, B->rmap->N, B->cmap->n, bs);
3125 
3126   if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3127   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nz cannot be less than 0: value %" PetscInt_FMT, nz);
3128   if (nnz) {
3129     for (i = 0; i < mbs; i++) {
3130       PetscCheck(nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, nnz[i]);
3131       PetscCheck(nnz[i] <= nbs, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "nnz cannot be greater than block row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, nnz[i], nbs);
3132     }
3133   }
3134 
3135   b = (Mat_SeqBAIJ *)B->data;
3136   PetscOptionsBegin(PetscObjectComm((PetscObject)B), NULL, "Optimize options for SEQBAIJ matrix 2 ", "Mat");
3137   PetscCall(PetscOptionsBool("-mat_no_unroll", "Do not optimize for block size (slow)", NULL, flg, &flg, NULL));
3138   PetscOptionsEnd();
3139 
3140   if (!flg) {
3141     switch (bs) {
3142     case 1:
3143       B->ops->mult    = MatMult_SeqBAIJ_1;
3144       B->ops->multadd = MatMultAdd_SeqBAIJ_1;
3145       break;
3146     case 2:
3147       B->ops->mult    = MatMult_SeqBAIJ_2;
3148       B->ops->multadd = MatMultAdd_SeqBAIJ_2;
3149       break;
3150     case 3:
3151       B->ops->mult    = MatMult_SeqBAIJ_3;
3152       B->ops->multadd = MatMultAdd_SeqBAIJ_3;
3153       break;
3154     case 4:
3155       B->ops->mult    = MatMult_SeqBAIJ_4;
3156       B->ops->multadd = MatMultAdd_SeqBAIJ_4;
3157       break;
3158     case 5:
3159       B->ops->mult    = MatMult_SeqBAIJ_5;
3160       B->ops->multadd = MatMultAdd_SeqBAIJ_5;
3161       break;
3162     case 6:
3163       B->ops->mult    = MatMult_SeqBAIJ_6;
3164       B->ops->multadd = MatMultAdd_SeqBAIJ_6;
3165       break;
3166     case 7:
3167       B->ops->mult    = MatMult_SeqBAIJ_7;
3168       B->ops->multadd = MatMultAdd_SeqBAIJ_7;
3169       break;
3170     case 9: {
3171       PetscInt version = 1;
3172       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3173       switch (version) {
3174 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3175       case 1:
3176         B->ops->mult    = MatMult_SeqBAIJ_9_AVX2;
3177         B->ops->multadd = MatMultAdd_SeqBAIJ_9_AVX2;
3178         PetscCall(PetscInfo((PetscObject)B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3179         break;
3180 #endif
3181       default:
3182         B->ops->mult    = MatMult_SeqBAIJ_N;
3183         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3184         PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3185         break;
3186       }
3187       break;
3188     }
3189     case 11:
3190       B->ops->mult    = MatMult_SeqBAIJ_11;
3191       B->ops->multadd = MatMultAdd_SeqBAIJ_11;
3192       break;
3193     case 12: {
3194       PetscInt version = 1;
3195       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3196       switch (version) {
3197       case 1:
3198         B->ops->mult    = MatMult_SeqBAIJ_12_ver1;
3199         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3200         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3201         break;
3202       case 2:
3203         B->ops->mult    = MatMult_SeqBAIJ_12_ver2;
3204         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver2;
3205         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3206         break;
3207 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
3208       case 3:
3209         B->ops->mult    = MatMult_SeqBAIJ_12_AVX2;
3210         B->ops->multadd = MatMultAdd_SeqBAIJ_12_ver1;
3211         PetscCall(PetscInfo((PetscObject)B, "Using AVX2 for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3212         break;
3213 #endif
3214       default:
3215         B->ops->mult    = MatMult_SeqBAIJ_N;
3216         B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3217         PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3218         break;
3219       }
3220       break;
3221     }
3222     case 15: {
3223       PetscInt version = 1;
3224       PetscCall(PetscOptionsGetInt(NULL, ((PetscObject)B)->prefix, "-mat_baij_mult_version", &version, NULL));
3225       switch (version) {
3226       case 1:
3227         B->ops->mult = MatMult_SeqBAIJ_15_ver1;
3228         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3229         break;
3230       case 2:
3231         B->ops->mult = MatMult_SeqBAIJ_15_ver2;
3232         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3233         break;
3234       case 3:
3235         B->ops->mult = MatMult_SeqBAIJ_15_ver3;
3236         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3237         break;
3238       case 4:
3239         B->ops->mult = MatMult_SeqBAIJ_15_ver4;
3240         PetscCall(PetscInfo((PetscObject)B, "Using version %" PetscInt_FMT " of MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", version, bs));
3241         break;
3242       default:
3243         B->ops->mult = MatMult_SeqBAIJ_N;
3244         PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3245         break;
3246       }
3247       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3248       break;
3249     }
3250     default:
3251       B->ops->mult    = MatMult_SeqBAIJ_N;
3252       B->ops->multadd = MatMultAdd_SeqBAIJ_N;
3253       PetscCall(PetscInfo((PetscObject)B, "Using BLAS for MatMult for BAIJ for blocksize %" PetscInt_FMT "\n", bs));
3254       break;
3255     }
3256   }
3257   B->ops->sor = MatSOR_SeqBAIJ;
3258   b->mbs      = mbs;
3259   b->nbs      = nbs;
3260   if (!skipallocation) {
3261     if (!b->imax) {
3262       PetscCall(PetscMalloc2(mbs, &b->imax, mbs, &b->ilen));
3263 
3264       b->free_imax_ilen = PETSC_TRUE;
3265     }
3266     /* b->ilen will count nonzeros in each block row so far. */
3267     for (i = 0; i < mbs; i++) b->ilen[i] = 0;
3268     if (!nnz) {
3269       if (nz == PETSC_DEFAULT || nz == PETSC_DECIDE) nz = 5;
3270       else if (nz < 0) nz = 1;
3271       nz = PetscMin(nz, nbs);
3272       for (i = 0; i < mbs; i++) b->imax[i] = nz;
3273       PetscCall(PetscIntMultError(nz, mbs, &nz));
3274     } else {
3275       PetscInt64 nz64 = 0;
3276       for (i = 0; i < mbs; i++) {
3277         b->imax[i] = nnz[i];
3278         nz64 += nnz[i];
3279       }
3280       PetscCall(PetscIntCast(nz64, &nz));
3281     }
3282 
3283     /* allocate the matrix space */
3284     PetscCall(MatSeqXAIJFreeAIJ(B, &b->a, &b->j, &b->i));
3285     if (B->structure_only) {
3286       PetscCall(PetscMalloc1(nz, &b->j));
3287       PetscCall(PetscMalloc1(B->rmap->N + 1, &b->i));
3288     } else {
3289       PetscInt nzbs2 = 0;
3290       PetscCall(PetscIntMultError(nz, bs2, &nzbs2));
3291       PetscCall(PetscMalloc3(nzbs2, &b->a, nz, &b->j, B->rmap->N + 1, &b->i));
3292       PetscCall(PetscArrayzero(b->a, nz * bs2));
3293     }
3294     PetscCall(PetscArrayzero(b->j, nz));
3295 
3296     if (B->structure_only) {
3297       b->singlemalloc = PETSC_FALSE;
3298       b->free_a       = PETSC_FALSE;
3299     } else {
3300       b->singlemalloc = PETSC_TRUE;
3301       b->free_a       = PETSC_TRUE;
3302     }
3303     b->free_ij = PETSC_TRUE;
3304 
3305     b->i[0] = 0;
3306     for (i = 1; i < mbs + 1; i++) b->i[i] = b->i[i - 1] + b->imax[i - 1];
3307 
3308   } else {
3309     b->free_a  = PETSC_FALSE;
3310     b->free_ij = PETSC_FALSE;
3311   }
3312 
3313   b->bs2              = bs2;
3314   b->mbs              = mbs;
3315   b->nz               = 0;
3316   b->maxnz            = nz;
3317   B->info.nz_unneeded = (PetscReal)b->maxnz * bs2;
3318   B->was_assembled    = PETSC_FALSE;
3319   B->assembled        = PETSC_FALSE;
3320   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
3321   PetscFunctionReturn(0);
3322 }
3323 
3324 PetscErrorCode MatSeqBAIJSetPreallocationCSR_SeqBAIJ(Mat B, PetscInt bs, const PetscInt ii[], const PetscInt jj[], const PetscScalar V[])
3325 {
3326   PetscInt     i, m, nz, nz_max = 0, *nnz;
3327   PetscScalar *values      = NULL;
3328   PetscBool    roworiented = ((Mat_SeqBAIJ *)B->data)->roworiented;
3329 
3330   PetscFunctionBegin;
3331   PetscCheck(bs >= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Invalid block size specified, must be positive but it is %" PetscInt_FMT, bs);
3332   PetscCall(PetscLayoutSetBlockSize(B->rmap, bs));
3333   PetscCall(PetscLayoutSetBlockSize(B->cmap, bs));
3334   PetscCall(PetscLayoutSetUp(B->rmap));
3335   PetscCall(PetscLayoutSetUp(B->cmap));
3336   PetscCall(PetscLayoutGetBlockSize(B->rmap, &bs));
3337   m = B->rmap->n / bs;
3338 
3339   PetscCheck(ii[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "ii[0] must be 0 but it is %" PetscInt_FMT, ii[0]);
3340   PetscCall(PetscMalloc1(m + 1, &nnz));
3341   for (i = 0; i < m; i++) {
3342     nz = ii[i + 1] - ii[i];
3343     PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Local row %" PetscInt_FMT " has a negative number of columns %" PetscInt_FMT, i, nz);
3344     nz_max = PetscMax(nz_max, nz);
3345     nnz[i] = nz;
3346   }
3347   PetscCall(MatSeqBAIJSetPreallocation(B, bs, 0, nnz));
3348   PetscCall(PetscFree(nnz));
3349 
3350   values = (PetscScalar *)V;
3351   if (!values) PetscCall(PetscCalloc1(bs * bs * (nz_max + 1), &values));
3352   for (i = 0; i < m; i++) {
3353     PetscInt        ncols = ii[i + 1] - ii[i];
3354     const PetscInt *icols = jj + ii[i];
3355     if (bs == 1 || !roworiented) {
3356       const PetscScalar *svals = values + (V ? (bs * bs * ii[i]) : 0);
3357       PetscCall(MatSetValuesBlocked_SeqBAIJ(B, 1, &i, ncols, icols, svals, INSERT_VALUES));
3358     } else {
3359       PetscInt j;
3360       for (j = 0; j < ncols; j++) {
3361         const PetscScalar *svals = values + (V ? (bs * bs * (ii[i] + j)) : 0);
3362         PetscCall(MatSetValuesBlocked_SeqBAIJ(B, 1, &i, 1, &icols[j], svals, INSERT_VALUES));
3363       }
3364     }
3365   }
3366   if (!V) PetscCall(PetscFree(values));
3367   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
3368   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
3369   PetscCall(MatSetOption(B, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3370   PetscFunctionReturn(0);
3371 }
3372 
3373 /*@C
3374    MatSeqBAIJGetArray - gives read/write access to the array where the data for a `MATSEQBAIJ` matrix is stored
3375 
3376    Not Collective
3377 
3378    Input Parameter:
3379 .  mat - a `MATSEQBAIJ` matrix
3380 
3381    Output Parameter:
3382 .   array - pointer to the data
3383 
3384    Level: intermediate
3385 
3386 .seealso: `MATSEQBAIJ`, `MatSeqBAIJRestoreArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3387 @*/
3388 PetscErrorCode MatSeqBAIJGetArray(Mat A, PetscScalar **array)
3389 {
3390   PetscFunctionBegin;
3391   PetscUseMethod(A, "MatSeqBAIJGetArray_C", (Mat, PetscScalar **), (A, array));
3392   PetscFunctionReturn(0);
3393 }
3394 
3395 /*@C
3396    MatSeqBAIJRestoreArray - returns access to the array where the data for a `MATSEQBAIJ` matrix is stored obtained by `MatSeqBAIJGetArray()`
3397 
3398    Not Collective
3399 
3400    Input Parameters:
3401 +  mat - a `MATSEQBAIJ` matrix
3402 -  array - pointer to the data
3403 
3404    Level: intermediate
3405 
3406 .seealso: `MatSeqBAIJGetArray()`, `MatSeqAIJGetArray()`, `MatSeqAIJRestoreArray()`
3407 @*/
3408 PetscErrorCode MatSeqBAIJRestoreArray(Mat A, PetscScalar **array)
3409 {
3410   PetscFunctionBegin;
3411   PetscUseMethod(A, "MatSeqBAIJRestoreArray_C", (Mat, PetscScalar **), (A, array));
3412   PetscFunctionReturn(0);
3413 }
3414 
3415 /*MC
3416    MATSEQBAIJ - MATSEQBAIJ = "seqbaij" - A matrix type to be used for sequential block sparse matrices, based on
3417    block sparse compressed row format.
3418 
3419    Options Database Keys:
3420 + -mat_type seqbaij - sets the matrix type to "seqbaij" during a call to MatSetFromOptions()
3421 - -mat_baij_mult_version version - indicate the version of the matrix-vector product to use (0 often indicates using BLAS)
3422 
3423    Level: beginner
3424 
3425    Notes:
3426     `MatSetOptions`(,`MAT_STRUCTURE_ONLY`,`PETSC_TRUE`) may be called for this matrix type. In this no
3427     space is allocated for the nonzero entries and any entries passed with `MatSetValues()` are ignored
3428 
3429    Run with -info to see what version of the matrix-vector product is being used
3430 
3431 .seealso: `MatCreateSeqBAIJ()`
3432 M*/
3433 
3434 PETSC_INTERN PetscErrorCode MatConvert_SeqBAIJ_SeqBSTRM(Mat, MatType, MatReuse, Mat *);
3435 
3436 PETSC_EXTERN PetscErrorCode MatCreate_SeqBAIJ(Mat B)
3437 {
3438   PetscMPIInt  size;
3439   Mat_SeqBAIJ *b;
3440 
3441   PetscFunctionBegin;
3442   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
3443   PetscCheck(size == 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Comm must be of size 1");
3444 
3445   PetscCall(PetscNew(&b));
3446   B->data = (void *)b;
3447   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
3448 
3449   b->row          = NULL;
3450   b->col          = NULL;
3451   b->icol         = NULL;
3452   b->reallocs     = 0;
3453   b->saved_values = NULL;
3454 
3455   b->roworiented        = PETSC_TRUE;
3456   b->nonew              = 0;
3457   b->diag               = NULL;
3458   B->spptr              = NULL;
3459   B->info.nz_unneeded   = (PetscReal)b->maxnz * b->bs2;
3460   b->keepnonzeropattern = PETSC_FALSE;
3461 
3462   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJGetArray_C", MatSeqBAIJGetArray_SeqBAIJ));
3463   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJRestoreArray_C", MatSeqBAIJRestoreArray_SeqBAIJ));
3464   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqBAIJ));
3465   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqBAIJ));
3466   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetColumnIndices_C", MatSeqBAIJSetColumnIndices_SeqBAIJ));
3467   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqaij_C", MatConvert_SeqBAIJ_SeqAIJ));
3468   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_seqsbaij_C", MatConvert_SeqBAIJ_SeqSBAIJ));
3469   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocation_C", MatSeqBAIJSetPreallocation_SeqBAIJ));
3470   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqBAIJSetPreallocationCSR_C", MatSeqBAIJSetPreallocationCSR_SeqBAIJ));
3471   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatIsTranspose_C", MatIsTranspose_SeqBAIJ));
3472 #if defined(PETSC_HAVE_HYPRE)
3473   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_hypre_C", MatConvert_AIJ_HYPRE));
3474 #endif
3475   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqbaij_is_C", MatConvert_XAIJ_IS));
3476   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQBAIJ));
3477   PetscFunctionReturn(0);
3478 }
3479 
3480 PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
3481 {
3482   Mat_SeqBAIJ *c = (Mat_SeqBAIJ *)C->data, *a = (Mat_SeqBAIJ *)A->data;
3483   PetscInt     i, mbs = a->mbs, nz = a->nz, bs2 = a->bs2;
3484 
3485   PetscFunctionBegin;
3486   PetscCheck(a->i[mbs] == nz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Corrupt matrix");
3487 
3488   if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3489     c->imax           = a->imax;
3490     c->ilen           = a->ilen;
3491     c->free_imax_ilen = PETSC_FALSE;
3492   } else {
3493     PetscCall(PetscMalloc2(mbs, &c->imax, mbs, &c->ilen));
3494     for (i = 0; i < mbs; i++) {
3495       c->imax[i] = a->imax[i];
3496       c->ilen[i] = a->ilen[i];
3497     }
3498     c->free_imax_ilen = PETSC_TRUE;
3499   }
3500 
3501   /* allocate the matrix space */
3502   if (mallocmatspace) {
3503     if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3504       PetscCall(PetscCalloc1(bs2 * nz, &c->a));
3505 
3506       c->i            = a->i;
3507       c->j            = a->j;
3508       c->singlemalloc = PETSC_FALSE;
3509       c->free_a       = PETSC_TRUE;
3510       c->free_ij      = PETSC_FALSE;
3511       c->parent       = A;
3512       C->preallocated = PETSC_TRUE;
3513       C->assembled    = PETSC_TRUE;
3514 
3515       PetscCall(PetscObjectReference((PetscObject)A));
3516       PetscCall(MatSetOption(A, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3517       PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
3518     } else {
3519       PetscCall(PetscMalloc3(bs2 * nz, &c->a, nz, &c->j, mbs + 1, &c->i));
3520 
3521       c->singlemalloc = PETSC_TRUE;
3522       c->free_a       = PETSC_TRUE;
3523       c->free_ij      = PETSC_TRUE;
3524 
3525       PetscCall(PetscArraycpy(c->i, a->i, mbs + 1));
3526       if (mbs > 0) {
3527         PetscCall(PetscArraycpy(c->j, a->j, nz));
3528         if (cpvalues == MAT_COPY_VALUES) {
3529           PetscCall(PetscArraycpy(c->a, a->a, bs2 * nz));
3530         } else {
3531           PetscCall(PetscArrayzero(c->a, bs2 * nz));
3532         }
3533       }
3534       C->preallocated = PETSC_TRUE;
3535       C->assembled    = PETSC_TRUE;
3536     }
3537   }
3538 
3539   c->roworiented = a->roworiented;
3540   c->nonew       = a->nonew;
3541 
3542   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
3543   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
3544 
3545   c->bs2 = a->bs2;
3546   c->mbs = a->mbs;
3547   c->nbs = a->nbs;
3548 
3549   if (a->diag) {
3550     if (cpvalues == MAT_SHARE_NONZERO_PATTERN) {
3551       c->diag      = a->diag;
3552       c->free_diag = PETSC_FALSE;
3553     } else {
3554       PetscCall(PetscMalloc1(mbs + 1, &c->diag));
3555       for (i = 0; i < mbs; i++) c->diag[i] = a->diag[i];
3556       c->free_diag = PETSC_TRUE;
3557     }
3558   } else c->diag = NULL;
3559 
3560   c->nz         = a->nz;
3561   c->maxnz      = a->nz; /* Since we allocate exactly the right amount */
3562   c->solve_work = NULL;
3563   c->mult_work  = NULL;
3564   c->sor_workt  = NULL;
3565   c->sor_work   = NULL;
3566 
3567   c->compressedrow.use   = a->compressedrow.use;
3568   c->compressedrow.nrows = a->compressedrow.nrows;
3569   if (a->compressedrow.use) {
3570     i = a->compressedrow.nrows;
3571     PetscCall(PetscMalloc2(i + 1, &c->compressedrow.i, i + 1, &c->compressedrow.rindex));
3572     PetscCall(PetscArraycpy(c->compressedrow.i, a->compressedrow.i, i + 1));
3573     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, i));
3574   } else {
3575     c->compressedrow.use    = PETSC_FALSE;
3576     c->compressedrow.i      = NULL;
3577     c->compressedrow.rindex = NULL;
3578   }
3579   C->nonzerostate = A->nonzerostate;
3580 
3581   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
3582   PetscFunctionReturn(0);
3583 }
3584 
3585 PetscErrorCode MatDuplicate_SeqBAIJ(Mat A, MatDuplicateOption cpvalues, Mat *B)
3586 {
3587   PetscFunctionBegin;
3588   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
3589   PetscCall(MatSetSizes(*B, A->rmap->N, A->cmap->n, A->rmap->N, A->cmap->n));
3590   PetscCall(MatSetType(*B, MATSEQBAIJ));
3591   PetscCall(MatDuplicateNoCreate_SeqBAIJ(*B, A, cpvalues, PETSC_TRUE));
3592   PetscFunctionReturn(0);
3593 }
3594 
3595 /* Used for both SeqBAIJ and SeqSBAIJ matrices */
3596 PetscErrorCode MatLoad_SeqBAIJ_Binary(Mat mat, PetscViewer viewer)
3597 {
3598   PetscInt     header[4], M, N, nz, bs, m, n, mbs, nbs, rows, cols, sum, i, j, k;
3599   PetscInt    *rowidxs, *colidxs;
3600   PetscScalar *matvals;
3601 
3602   PetscFunctionBegin;
3603   PetscCall(PetscViewerSetUp(viewer));
3604 
3605   /* read matrix header */
3606   PetscCall(PetscViewerBinaryRead(viewer, header, 4, NULL, PETSC_INT));
3607   PetscCheck(header[0] == MAT_FILE_CLASSID, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Not a matrix object in file");
3608   M  = header[1];
3609   N  = header[2];
3610   nz = header[3];
3611   PetscCheck(M >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix row size (%" PetscInt_FMT ") in file is negative", M);
3612   PetscCheck(N >= 0, PetscObjectComm((PetscObject)viewer), PETSC_ERR_FILE_UNEXPECTED, "Matrix column size (%" PetscInt_FMT ") in file is negative", N);
3613   PetscCheck(nz >= 0, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix stored in special format on disk, cannot load as SeqBAIJ");
3614 
3615   /* set block sizes from the viewer's .info file */
3616   PetscCall(MatLoad_Binary_BlockSizes(mat, viewer));
3617   /* set local and global sizes if not set already */
3618   if (mat->rmap->n < 0) mat->rmap->n = M;
3619   if (mat->cmap->n < 0) mat->cmap->n = N;
3620   if (mat->rmap->N < 0) mat->rmap->N = M;
3621   if (mat->cmap->N < 0) mat->cmap->N = N;
3622   PetscCall(PetscLayoutSetUp(mat->rmap));
3623   PetscCall(PetscLayoutSetUp(mat->cmap));
3624 
3625   /* check if the matrix sizes are correct */
3626   PetscCall(MatGetSize(mat, &rows, &cols));
3627   PetscCheck(M == rows && N == cols, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Matrix in file of different sizes (%" PetscInt_FMT ", %" PetscInt_FMT ") than the input matrix (%" PetscInt_FMT ", %" PetscInt_FMT ")", M, N, rows, cols);
3628   PetscCall(MatGetBlockSize(mat, &bs));
3629   PetscCall(MatGetLocalSize(mat, &m, &n));
3630   mbs = m / bs;
3631   nbs = n / bs;
3632 
3633   /* read in row lengths, column indices and nonzero values */
3634   PetscCall(PetscMalloc1(m + 1, &rowidxs));
3635   PetscCall(PetscViewerBinaryRead(viewer, rowidxs + 1, m, NULL, PETSC_INT));
3636   rowidxs[0] = 0;
3637   for (i = 0; i < m; i++) rowidxs[i + 1] += rowidxs[i];
3638   sum = rowidxs[m];
3639   PetscCheck(sum == nz, PETSC_COMM_SELF, PETSC_ERR_FILE_UNEXPECTED, "Inconsistent matrix data in file: nonzeros = %" PetscInt_FMT ", sum-row-lengths = %" PetscInt_FMT, nz, sum);
3640 
3641   /* read in column indices and nonzero values */
3642   PetscCall(PetscMalloc2(rowidxs[m], &colidxs, nz, &matvals));
3643   PetscCall(PetscViewerBinaryRead(viewer, colidxs, rowidxs[m], NULL, PETSC_INT));
3644   PetscCall(PetscViewerBinaryRead(viewer, matvals, rowidxs[m], NULL, PETSC_SCALAR));
3645 
3646   {               /* preallocate matrix storage */
3647     PetscBT   bt; /* helper bit set to count nonzeros */
3648     PetscInt *nnz;
3649     PetscBool sbaij;
3650 
3651     PetscCall(PetscBTCreate(nbs, &bt));
3652     PetscCall(PetscCalloc1(mbs, &nnz));
3653     PetscCall(PetscObjectTypeCompare((PetscObject)mat, MATSEQSBAIJ, &sbaij));
3654     for (i = 0; i < mbs; i++) {
3655       PetscCall(PetscBTMemzero(nbs, bt));
3656       for (k = 0; k < bs; k++) {
3657         PetscInt row = bs * i + k;
3658         for (j = rowidxs[row]; j < rowidxs[row + 1]; j++) {
3659           PetscInt col = colidxs[j];
3660           if (!sbaij || col >= row)
3661             if (!PetscBTLookupSet(bt, col / bs)) nnz[i]++;
3662         }
3663       }
3664     }
3665     PetscCall(PetscBTDestroy(&bt));
3666     PetscCall(MatSeqBAIJSetPreallocation(mat, bs, 0, nnz));
3667     PetscCall(MatSeqSBAIJSetPreallocation(mat, bs, 0, nnz));
3668     PetscCall(PetscFree(nnz));
3669   }
3670 
3671   /* store matrix values */
3672   for (i = 0; i < m; i++) {
3673     PetscInt row = i, s = rowidxs[i], e = rowidxs[i + 1];
3674     PetscCall((*mat->ops->setvalues)(mat, 1, &row, e - s, colidxs + s, matvals + s, INSERT_VALUES));
3675   }
3676 
3677   PetscCall(PetscFree(rowidxs));
3678   PetscCall(PetscFree2(colidxs, matvals));
3679   PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
3680   PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
3681   PetscFunctionReturn(0);
3682 }
3683 
3684 PetscErrorCode MatLoad_SeqBAIJ(Mat mat, PetscViewer viewer)
3685 {
3686   PetscBool isbinary;
3687 
3688   PetscFunctionBegin;
3689   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
3690   PetscCheck(isbinary, PetscObjectComm((PetscObject)viewer), PETSC_ERR_SUP, "Viewer type %s not yet supported for reading %s matrices", ((PetscObject)viewer)->type_name, ((PetscObject)mat)->type_name);
3691   PetscCall(MatLoad_SeqBAIJ_Binary(mat, viewer));
3692   PetscFunctionReturn(0);
3693 }
3694 
3695 /*@C
3696    MatCreateSeqBAIJ - Creates a sparse matrix in `MATSEQAIJ` (block
3697    compressed row) format.  For good matrix assembly performance the
3698    user should preallocate the matrix storage by setting the parameter nz
3699    (or the array nnz).  By setting these parameters accurately, performance
3700    during matrix assembly can be increased by more than a factor of 50.
3701 
3702    Collective
3703 
3704    Input Parameters:
3705 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
3706 .  bs - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3707           blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3708 .  m - number of rows
3709 .  n - number of columns
3710 .  nz - number of nonzero blocks  per block row (same for all rows)
3711 -  nnz - array containing the number of nonzero blocks in the various block rows
3712          (possibly different for each block row) or NULL
3713 
3714    Output Parameter:
3715 .  A - the matrix
3716 
3717    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
3718    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3719    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
3720 
3721    Options Database Keys:
3722 +   -mat_no_unroll - uses code that does not unroll the loops in the block calculations (much slower)
3723 -   -mat_block_size - size of the blocks to use
3724 
3725    Level: intermediate
3726 
3727    Notes:
3728    The number of rows and columns must be divisible by blocksize.
3729 
3730    If the nnz parameter is given then the nz parameter is ignored
3731 
3732    A nonzero block is any block that as 1 or more nonzeros in it
3733 
3734    The `MATSEQBAIJ` format is fully compatible with standard Fortran 77
3735    storage.  That is, the stored row and column indices can begin at
3736    either one (as in Fortran) or zero.  See the users' manual for details.
3737 
3738    Specify the preallocated storage with either nz or nnz (not both).
3739    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3740    allocation.  See [Sparse Matrices](sec_matsparse) for details.
3741    matrices.
3742 
3743 .seealso: [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`
3744 @*/
3745 PetscErrorCode MatCreateSeqBAIJ(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3746 {
3747   PetscFunctionBegin;
3748   PetscCall(MatCreate(comm, A));
3749   PetscCall(MatSetSizes(*A, m, n, m, n));
3750   PetscCall(MatSetType(*A, MATSEQBAIJ));
3751   PetscCall(MatSeqBAIJSetPreallocation(*A, bs, nz, (PetscInt *)nnz));
3752   PetscFunctionReturn(0);
3753 }
3754 
3755 /*@C
3756    MatSeqBAIJSetPreallocation - Sets the block size and expected nonzeros
3757    per row in the matrix. For good matrix assembly performance the
3758    user should preallocate the matrix storage by setting the parameter nz
3759    (or the array nnz).  By setting these parameters accurately, performance
3760    during matrix assembly can be increased by more than a factor of 50.
3761 
3762    Collective
3763 
3764    Input Parameters:
3765 +  B - the matrix
3766 .  bs - size of block, the blocks are ALWAYS square. One can use `MatSetBlockSizes()` to set a different row and column blocksize but the row
3767           blocksize always defines the size of the blocks. The column blocksize sets the blocksize of the vectors obtained with `MatCreateVecs()`
3768 .  nz - number of block nonzeros per block row (same for all rows)
3769 -  nnz - array containing the number of block nonzeros in the various block rows
3770          (possibly different for each block row) or NULL
3771 
3772    Options Database Keys:
3773 +   -mat_no_unroll - uses code that does not unroll the loops in the block calculations (much slower)
3774 -   -mat_block_size - size of the blocks to use
3775 
3776    Level: intermediate
3777 
3778    Notes:
3779    If the nnz parameter is given then the nz parameter is ignored
3780 
3781    You can call `MatGetInfo()` to get information on how effective the preallocation was;
3782    for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
3783    You can also run with the option -info and look for messages with the string
3784    malloc in them to see if additional memory allocation was needed.
3785 
3786    The `MATSEQBAIJ` format is fully compatible with standard Fortran 77
3787    storage.  That is, the stored row and column indices can begin at
3788    either one (as in Fortran) or zero.  See the users' manual for details.
3789 
3790    Specify the preallocated storage with either nz or nnz (not both).
3791    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
3792    allocation.  See [Sparse Matrices](sec_matsparse) for details.
3793 
3794 .seealso: [Sparse Matrices](sec_matsparse), `MatCreate()`, `MatCreateSeqAIJ()`, `MatSetValues()`, `MatCreateBAIJ()`, `MatGetInfo()`
3795 @*/
3796 PetscErrorCode MatSeqBAIJSetPreallocation(Mat B, PetscInt bs, PetscInt nz, const PetscInt nnz[])
3797 {
3798   PetscFunctionBegin;
3799   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
3800   PetscValidType(B, 1);
3801   PetscValidLogicalCollectiveInt(B, bs, 2);
3802   PetscTryMethod(B, "MatSeqBAIJSetPreallocation_C", (Mat, PetscInt, PetscInt, const PetscInt[]), (B, bs, nz, nnz));
3803   PetscFunctionReturn(0);
3804 }
3805 
3806 /*@C
3807    MatSeqBAIJSetPreallocationCSR - Creates a sparse sequential matrix in `MATSEQBAIJ` format using the given nonzero structure and (optional) numerical values
3808 
3809    Collective
3810 
3811    Input Parameters:
3812 +  B - the matrix
3813 .  i - the indices into j for the start of each local row (starts with zero)
3814 .  j - the column indices for each local row (starts with zero) these must be sorted for each row
3815 -  v - optional values in the matrix
3816 
3817    Level: advanced
3818 
3819    Notes:
3820    The order of the entries in values is specified by the `MatOption` `MAT_ROW_ORIENTED`.  For example, C programs
3821    may want to use the default `MAT_ROW_ORIENTED` of `PETSC_TRUE` and use an array v[nnz][bs][bs] where the second index is
3822    over rows within a block and the last index is over columns within a block row.  Fortran programs will likely set
3823    `MAT_ROW_ORIENTED` of `PETSC_FALSE` and use a Fortran array v(bs,bs,nnz) in which the first index is over rows within a
3824    block column and the second index is over columns within a block.
3825 
3826    Though this routine has Preallocation() in the name it also sets the exact nonzero locations of the matrix entries and usually the numerical values as well
3827 
3828 .seealso: `MatCreate()`, `MatCreateSeqBAIJ()`, `MatSetValues()`, `MatSeqBAIJSetPreallocation()`, `MATSEQBAIJ`
3829 @*/
3830 PetscErrorCode MatSeqBAIJSetPreallocationCSR(Mat B, PetscInt bs, const PetscInt i[], const PetscInt j[], const PetscScalar v[])
3831 {
3832   PetscFunctionBegin;
3833   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
3834   PetscValidType(B, 1);
3835   PetscValidLogicalCollectiveInt(B, bs, 2);
3836   PetscTryMethod(B, "MatSeqBAIJSetPreallocationCSR_C", (Mat, PetscInt, const PetscInt[], const PetscInt[], const PetscScalar[]), (B, bs, i, j, v));
3837   PetscFunctionReturn(0);
3838 }
3839 
3840 /*@
3841      MatCreateSeqBAIJWithArrays - Creates a `MATSEQBAIJ` matrix using matrix elements provided by the user.
3842 
3843      Collective
3844 
3845    Input Parameters:
3846 +  comm - must be an MPI communicator of size 1
3847 .  bs - size of block
3848 .  m - number of rows
3849 .  n - number of columns
3850 .  i - row indices; that is i[0] = 0, i[row] = i[row-1] + number of elements in that row block row of the matrix
3851 .  j - column indices
3852 -  a - matrix values
3853 
3854    Output Parameter:
3855 .  mat - the matrix
3856 
3857    Level: advanced
3858 
3859    Notes:
3860        The i, j, and a arrays are not copied by this routine, the user must free these arrays
3861     once the matrix is destroyed
3862 
3863        You cannot set new nonzero locations into this matrix, that will generate an error.
3864 
3865        The i and j indices are 0 based
3866 
3867        When block size is greater than 1 the matrix values must be stored using the `MATSEQBAIJ` storage format
3868 
3869       The order of the entries in values is the same as the block compressed sparse row storage format; that is, it is
3870       the same as a three dimensional array in Fortran values(bs,bs,nnz) that contains the first column of the first
3871       block, followed by the second column of the first block etc etc.  That is, the blocks are contiguous in memory
3872       with column-major ordering within blocks.
3873 
3874 .seealso: `MatCreate()`, `MatCreateBAIJ()`, `MatCreateSeqBAIJ()`
3875 @*/
3876 PetscErrorCode MatCreateSeqBAIJWithArrays(MPI_Comm comm, PetscInt bs, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], Mat *mat)
3877 {
3878   PetscInt     ii;
3879   Mat_SeqBAIJ *baij;
3880 
3881   PetscFunctionBegin;
3882   PetscCheck(bs == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "block size %" PetscInt_FMT " > 1 is not supported yet", bs);
3883   if (m > 0) PetscCheck(i[0] == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0");
3884 
3885   PetscCall(MatCreate(comm, mat));
3886   PetscCall(MatSetSizes(*mat, m, n, m, n));
3887   PetscCall(MatSetType(*mat, MATSEQBAIJ));
3888   PetscCall(MatSeqBAIJSetPreallocation(*mat, bs, MAT_SKIP_ALLOCATION, NULL));
3889   baij = (Mat_SeqBAIJ *)(*mat)->data;
3890   PetscCall(PetscMalloc2(m, &baij->imax, m, &baij->ilen));
3891 
3892   baij->i = i;
3893   baij->j = j;
3894   baij->a = a;
3895 
3896   baij->singlemalloc = PETSC_FALSE;
3897   baij->nonew        = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
3898   baij->free_a       = PETSC_FALSE;
3899   baij->free_ij      = PETSC_FALSE;
3900 
3901   for (ii = 0; ii < m; ii++) {
3902     baij->ilen[ii] = baij->imax[ii] = i[ii + 1] - i[ii];
3903     PetscCheck(i[ii + 1] - i[ii] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative row length in i (row indices) row = %" PetscInt_FMT " length = %" PetscInt_FMT, ii, i[ii + 1] - i[ii]);
3904   }
3905   if (PetscDefined(USE_DEBUG)) {
3906     for (ii = 0; ii < baij->i[m]; ii++) {
3907       PetscCheck(j[ii] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Negative column index at location = %" PetscInt_FMT " index = %" PetscInt_FMT, ii, j[ii]);
3908       PetscCheck(j[ii] <= n - 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column index to large at location = %" PetscInt_FMT " index = %" PetscInt_FMT, ii, j[ii]);
3909     }
3910   }
3911 
3912   PetscCall(MatAssemblyBegin(*mat, MAT_FINAL_ASSEMBLY));
3913   PetscCall(MatAssemblyEnd(*mat, MAT_FINAL_ASSEMBLY));
3914   PetscFunctionReturn(0);
3915 }
3916 
3917 PetscErrorCode MatCreateMPIMatConcatenateSeqMat_SeqBAIJ(MPI_Comm comm, Mat inmat, PetscInt n, MatReuse scall, Mat *outmat)
3918 {
3919   PetscFunctionBegin;
3920   PetscCall(MatCreateMPIMatConcatenateSeqMat_MPIBAIJ(comm, inmat, n, scall, outmat));
3921   PetscFunctionReturn(0);
3922 }
3923