xref: /petsc/src/mat/impls/sell/seq/sell.c (revision e0f5bfbec699682fa3e8b8532b1176849ea4e12a)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19 #include <immintrin.h>
20 
21 #if !defined(_MM_SCALE_8)
22 #define _MM_SCALE_8 8
23 #endif
24 
25 #if defined(__AVX512F__)
26 /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31   /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32   vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33   vec_vals = _mm512_loadu_pd(aval); \
34   vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35   vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36 #elif defined(__AVX2__) && defined(__FMA__)
37 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38   vec_vals = _mm256_loadu_pd(aval); \
39   vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40   vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41   vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42 #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter nz
48  (or the array nnz).  By setting these parameters accurately, performance
49  during matrix assembly can be increased significantly.
50 
51  Collective
52 
53  Input Parameters:
54  +  B - The `MATSEQSELL` matrix
55  .  nz - number of nonzeros per row (same for all rows)
56  -  nnz - array containing the number of nonzeros in the various rows
57  (possibly different for each row) or NULL
58 
59  Notes:
60  If nnz is given then nz is ignored.
61 
62  Specify the preallocated storage with either nz or nnz (not both).
63  Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
64  allocation.  For large problems you MUST preallocate memory or you
65  will get TERRIBLE performance, see the users' manual chapter on matrices.
66 
67  You can call `MatGetInfo()` to get information on how effective the preallocation was;
68  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
69  You can also run with the option -info and look for messages with the string
70  malloc in them to see if additional memory allocation was needed.
71 
72  Developers: Use nz of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
73  entries or columns indices.
74 
75  The maximum number of nonzeos in any row should be as accurate as possible.
76  If it is underestimated, you will get bad performance due to reallocation
77  (MatSeqXSELLReallocateSELL).
78 
79  Level: intermediate
80 
81  .seealso: `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
82  @*/
83 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) {
84   PetscFunctionBegin;
85   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
86   PetscValidType(B, 1);
87   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
88   PetscFunctionReturn(0);
89 }
90 
91 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) {
92   Mat_SeqSELL *b;
93   PetscInt     i, j, totalslices;
94   PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
95 
96   PetscFunctionBegin;
97   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98   if (maxallocrow == MAT_SKIP_ALLOCATION) {
99     skipallocation = PETSC_TRUE;
100     maxallocrow    = 0;
101   }
102 
103   PetscCall(PetscLayoutSetUp(B->rmap));
104   PetscCall(PetscLayoutSetUp(B->cmap));
105 
106   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109   if (rlen) {
110     for (i = 0; i < B->rmap->n; i++) {
111       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113     }
114   }
115 
116   B->preallocated = PETSC_TRUE;
117 
118   b = (Mat_SeqSELL *)B->data;
119 
120   totalslices    = PetscCeilInt(B->rmap->n, 8);
121   b->totalslices = totalslices;
122   if (!skipallocation) {
123     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));
124 
125     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
126       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
127     }
128     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
129       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
130       else if (maxallocrow < 0) maxallocrow = 1;
131       for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
132     } else {
133       maxallocrow  = 0;
134       b->sliidx[0] = 0;
135       for (i = 1; i < totalslices; i++) {
136         b->sliidx[i] = 0;
137         for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
138         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
139         PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
140       }
141       /* last slice */
142       b->sliidx[totalslices] = 0;
143       for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
144       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
145       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
146     }
147 
148     /* allocate space for val, colidx, rlen */
149     /* FIXME: should B's old memory be unlogged? */
150     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
151     /* FIXME: assuming an element of the bit array takes 8 bits */
152     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
153     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
154     PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
155 
156     b->singlemalloc = PETSC_TRUE;
157     b->free_val     = PETSC_TRUE;
158     b->free_colidx  = PETSC_TRUE;
159   } else {
160     b->free_val    = PETSC_FALSE;
161     b->free_colidx = PETSC_FALSE;
162   }
163 
164   b->nz               = 0;
165   b->maxallocrow      = maxallocrow;
166   b->rlenmax          = maxallocrow;
167   b->maxallocmat      = b->sliidx[totalslices];
168   B->info.nz_unneeded = (double)b->maxallocmat;
169   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
170   PetscFunctionReturn(0);
171 }
172 
173 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
174   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
175   PetscInt     shift;
176 
177   PetscFunctionBegin;
178   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
179   if (nz) *nz = a->rlen[row];
180   shift = a->sliidx[row >> 3] + (row & 0x07);
181   if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
182   if (idx) {
183     PetscInt j;
184     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
185     *idx = a->getrowcols;
186   }
187   if (v) {
188     PetscInt j;
189     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
190     *v = a->getrowvals;
191   }
192   PetscFunctionReturn(0);
193 }
194 
195 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) {
196   PetscFunctionBegin;
197   PetscFunctionReturn(0);
198 }
199 
200 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
201   Mat          B;
202   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
203   PetscInt     i;
204 
205   PetscFunctionBegin;
206   if (reuse == MAT_REUSE_MATRIX) {
207     B = *newmat;
208     PetscCall(MatZeroEntries(B));
209   } else {
210     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
211     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
212     PetscCall(MatSetType(B, MATSEQAIJ));
213     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
214   }
215 
216   for (i = 0; i < A->rmap->n; i++) {
217     PetscInt     nz = 0, *cols = NULL;
218     PetscScalar *vals = NULL;
219 
220     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
221     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
222     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
223   }
224 
225   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
226   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
227   B->rmap->bs = A->rmap->bs;
228 
229   if (reuse == MAT_INPLACE_MATRIX) {
230     PetscCall(MatHeaderReplace(A, &B));
231   } else {
232     *newmat = B;
233   }
234   PetscFunctionReturn(0);
235 }
236 
237 #include <../src/mat/impls/aij/seq/aij.h>
238 
239 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) {
240   Mat                B;
241   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
242   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
243   const PetscInt    *cols;
244   const PetscScalar *vals;
245 
246   PetscFunctionBegin;
247 
248   if (reuse == MAT_REUSE_MATRIX) {
249     B = *newmat;
250   } else {
251     if (PetscDefined(USE_DEBUG) || !a->ilen) {
252       PetscCall(PetscMalloc1(m, &rowlengths));
253       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
254     }
255     if (PetscDefined(USE_DEBUG) && a->ilen) {
256       PetscBool eq;
257       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
258       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
259       PetscCall(PetscFree(rowlengths));
260       rowlengths = a->ilen;
261     } else if (a->ilen) rowlengths = a->ilen;
262     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
263     PetscCall(MatSetSizes(B, m, n, m, n));
264     PetscCall(MatSetType(B, MATSEQSELL));
265     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
266     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
267   }
268 
269   for (row = 0; row < m; row++) {
270     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
271     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
272     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
273   }
274   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
275   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
276   B->rmap->bs = A->rmap->bs;
277 
278   if (reuse == MAT_INPLACE_MATRIX) {
279     PetscCall(MatHeaderReplace(A, &B));
280   } else {
281     *newmat = B;
282   }
283   PetscFunctionReturn(0);
284 }
285 
286 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) {
287   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
288   PetscScalar       *y;
289   const PetscScalar *x;
290   const MatScalar   *aval        = a->val;
291   PetscInt           totalslices = a->totalslices;
292   const PetscInt    *acolidx     = a->colidx;
293   PetscInt           i, j;
294 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
295   __m512d  vec_x, vec_y, vec_vals;
296   __m256i  vec_idx;
297   __mmask8 mask;
298   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
299   __m256i  vec_idx2, vec_idx3, vec_idx4;
300 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
301   __m128i   vec_idx;
302   __m256d   vec_x, vec_y, vec_y2, vec_vals;
303   MatScalar yval;
304   PetscInt  r, rows_left, row, nnz_in_row;
305 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
306   __m128d   vec_x_tmp;
307   __m256d   vec_x, vec_y, vec_y2, vec_vals;
308   MatScalar yval;
309   PetscInt  r, rows_left, row, nnz_in_row;
310 #else
311   PetscScalar sum[8];
312 #endif
313 
314 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
315 #pragma disjoint(*x, *y, *aval)
316 #endif
317 
318   PetscFunctionBegin;
319   PetscCall(VecGetArrayRead(xx, &x));
320   PetscCall(VecGetArray(yy, &y));
321 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
322   for (i = 0; i < totalslices; i++) { /* loop over slices */
323     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
324     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
325 
326     vec_y  = _mm512_setzero_pd();
327     vec_y2 = _mm512_setzero_pd();
328     vec_y3 = _mm512_setzero_pd();
329     vec_y4 = _mm512_setzero_pd();
330 
331     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */
332     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
333     case 3:
334       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
335       acolidx += 8;
336       aval += 8;
337       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
338       acolidx += 8;
339       aval += 8;
340       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
341       acolidx += 8;
342       aval += 8;
343       j += 3;
344       break;
345     case 2:
346       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
347       acolidx += 8;
348       aval += 8;
349       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
350       acolidx += 8;
351       aval += 8;
352       j += 2;
353       break;
354     case 1:
355       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
356       acolidx += 8;
357       aval += 8;
358       j += 1;
359       break;
360     }
361 #pragma novector
362     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
363       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
364       acolidx += 8;
365       aval += 8;
366       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
367       acolidx += 8;
368       aval += 8;
369       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
370       acolidx += 8;
371       aval += 8;
372       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
373       acolidx += 8;
374       aval += 8;
375     }
376 
377     vec_y = _mm512_add_pd(vec_y, vec_y2);
378     vec_y = _mm512_add_pd(vec_y, vec_y3);
379     vec_y = _mm512_add_pd(vec_y, vec_y4);
380     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
381       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
382       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
383     } else {
384       _mm512_storeu_pd(&y[8 * i], vec_y);
385     }
386   }
387 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
388   for (i = 0; i < totalslices; i++) { /* loop over full slices */
389     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
390     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
391 
392     /* last slice may have padding rows. Don't use vectorization. */
393     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
394       rows_left = A->rmap->n - 8 * i;
395       for (r = 0; r < rows_left; ++r) {
396         yval       = (MatScalar)0;
397         row        = 8 * i + r;
398         nnz_in_row = a->rlen[row];
399         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
400         y[row] = yval;
401       }
402       break;
403     }
404 
405     vec_y  = _mm256_setzero_pd();
406     vec_y2 = _mm256_setzero_pd();
407 
408 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
409 #pragma novector
410 #pragma unroll(2)
411     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
412       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
413       aval += 4;
414       acolidx += 4;
415       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
416       aval += 4;
417       acolidx += 4;
418     }
419 
420     _mm256_storeu_pd(y + i * 8, vec_y);
421     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
422   }
423 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
424   for (i = 0; i < totalslices; i++) { /* loop over full slices */
425     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
426     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
427 
428     vec_y  = _mm256_setzero_pd();
429     vec_y2 = _mm256_setzero_pd();
430 
431     /* last slice may have padding rows. Don't use vectorization. */
432     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
433       rows_left = A->rmap->n - 8 * i;
434       for (r = 0; r < rows_left; ++r) {
435         yval       = (MatScalar)0;
436         row        = 8 * i + r;
437         nnz_in_row = a->rlen[row];
438         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
439         y[row] = yval;
440       }
441       break;
442     }
443 
444 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
445 #pragma novector
446 #pragma unroll(2)
447     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
448       vec_vals  = _mm256_loadu_pd(aval);
449       vec_x_tmp = _mm_setzero_pd();
450       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
451       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
452       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
453       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
454       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
455       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
456       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
457       aval += 4;
458 
459       vec_vals  = _mm256_loadu_pd(aval);
460       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
461       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
462       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
463       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
464       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
465       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
466       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
467       aval += 4;
468     }
469 
470     _mm256_storeu_pd(y + i * 8, vec_y);
471     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
472   }
473 #else
474   for (i = 0; i < totalslices; i++) { /* loop over slices */
475     for (j = 0; j < 8; j++) sum[j] = 0.0;
476     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
477       sum[0] += aval[j] * x[acolidx[j]];
478       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
479       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
480       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
481       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
482       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
483       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
484       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
485     }
486     if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
487       for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
488     } else {
489       for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
490     }
491   }
492 #endif
493 
494   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
495   PetscCall(VecRestoreArrayRead(xx, &x));
496   PetscCall(VecRestoreArray(yy, &y));
497   PetscFunctionReturn(0);
498 }
499 
500 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
501 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) {
502   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
503   PetscScalar       *y, *z;
504   const PetscScalar *x;
505   const MatScalar   *aval        = a->val;
506   PetscInt           totalslices = a->totalslices;
507   const PetscInt    *acolidx     = a->colidx;
508   PetscInt           i, j;
509 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
510   __m512d  vec_x, vec_y, vec_vals;
511   __m256i  vec_idx;
512   __mmask8 mask;
513   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
514   __m256i  vec_idx2, vec_idx3, vec_idx4;
515 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
516   __m128d   vec_x_tmp;
517   __m256d   vec_x, vec_y, vec_y2, vec_vals;
518   MatScalar yval;
519   PetscInt  r, row, nnz_in_row;
520 #else
521   PetscScalar sum[8];
522 #endif
523 
524 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
525 #pragma disjoint(*x, *y, *aval)
526 #endif
527 
528   PetscFunctionBegin;
529   PetscCall(VecGetArrayRead(xx, &x));
530   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
531 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
532   for (i = 0; i < totalslices; i++) { /* loop over slices */
533     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
534     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
535 
536     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
537       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
538       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
539     } else {
540       vec_y = _mm512_loadu_pd(&y[8 * i]);
541     }
542     vec_y2 = _mm512_setzero_pd();
543     vec_y3 = _mm512_setzero_pd();
544     vec_y4 = _mm512_setzero_pd();
545 
546     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */
547     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
548     case 3:
549       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
550       acolidx += 8;
551       aval += 8;
552       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
553       acolidx += 8;
554       aval += 8;
555       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
556       acolidx += 8;
557       aval += 8;
558       j += 3;
559       break;
560     case 2:
561       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
562       acolidx += 8;
563       aval += 8;
564       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
565       acolidx += 8;
566       aval += 8;
567       j += 2;
568       break;
569     case 1:
570       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
571       acolidx += 8;
572       aval += 8;
573       j += 1;
574       break;
575     }
576 #pragma novector
577     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
578       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
579       acolidx += 8;
580       aval += 8;
581       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
582       acolidx += 8;
583       aval += 8;
584       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
585       acolidx += 8;
586       aval += 8;
587       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
588       acolidx += 8;
589       aval += 8;
590     }
591 
592     vec_y = _mm512_add_pd(vec_y, vec_y2);
593     vec_y = _mm512_add_pd(vec_y, vec_y3);
594     vec_y = _mm512_add_pd(vec_y, vec_y4);
595     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
596       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
597     } else {
598       _mm512_storeu_pd(&z[8 * i], vec_y);
599     }
600   }
601 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
602   for (i = 0; i < totalslices; i++) { /* loop over full slices */
603     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
604     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
605 
606     /* last slice may have padding rows. Don't use vectorization. */
607     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
608       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
609         row        = 8 * i + r;
610         yval       = (MatScalar)0.0;
611         nnz_in_row = a->rlen[row];
612         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
613         z[row] = y[row] + yval;
614       }
615       break;
616     }
617 
618     vec_y  = _mm256_loadu_pd(y + 8 * i);
619     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
620 
621     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
622     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
623       vec_vals  = _mm256_loadu_pd(aval);
624       vec_x_tmp = _mm_setzero_pd();
625       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
626       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
627       vec_x     = _mm256_setzero_pd();
628       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
629       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
630       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
631       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
632       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
633       aval += 4;
634 
635       vec_vals  = _mm256_loadu_pd(aval);
636       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
637       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
638       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
639       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
640       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
641       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
642       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
643       aval += 4;
644     }
645 
646     _mm256_storeu_pd(z + i * 8, vec_y);
647     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
648   }
649 #else
650   for (i = 0; i < totalslices; i++) { /* loop over slices */
651     for (j = 0; j < 8; j++) sum[j] = 0.0;
652     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
653       sum[0] += aval[j] * x[acolidx[j]];
654       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
655       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
656       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
657       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
658       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
659       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
660       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
661     }
662     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
663       for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
664     } else {
665       for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
666     }
667   }
668 #endif
669 
670   PetscCall(PetscLogFlops(2.0 * a->nz));
671   PetscCall(VecRestoreArrayRead(xx, &x));
672   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
673   PetscFunctionReturn(0);
674 }
675 
676 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) {
677   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
678   PetscScalar       *y;
679   const PetscScalar *x;
680   const MatScalar   *aval    = a->val;
681   const PetscInt    *acolidx = a->colidx;
682   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;
683 
684 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
685 #pragma disjoint(*x, *y, *aval)
686 #endif
687 
688   PetscFunctionBegin;
689   if (A->symmetric == PETSC_BOOL3_TRUE) {
690     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
691     PetscFunctionReturn(0);
692   }
693   if (zz != yy) PetscCall(VecCopy(zz, yy));
694   PetscCall(VecGetArrayRead(xx, &x));
695   PetscCall(VecGetArray(yy, &y));
696   for (i = 0; i < a->totalslices; i++) { /* loop over slices */
697     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
698       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
699         row        = 8 * i + r;
700         nnz_in_row = a->rlen[row];
701         for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
702       }
703       break;
704     }
705     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
706       y[acolidx[j]] += aval[j] * x[8 * i];
707       y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
708       y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
709       y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
710       y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
711       y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
712       y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
713       y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
714     }
715   }
716   PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices]));
717   PetscCall(VecRestoreArrayRead(xx, &x));
718   PetscCall(VecRestoreArray(yy, &y));
719   PetscFunctionReturn(0);
720 }
721 
722 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) {
723   PetscFunctionBegin;
724   if (A->symmetric == PETSC_BOOL3_TRUE) {
725     PetscCall(MatMult_SeqSELL(A, xx, yy));
726   } else {
727     PetscCall(VecSet(yy, 0.0));
728     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
729   }
730   PetscFunctionReturn(0);
731 }
732 
733 /*
734      Checks for missing diagonals
735 */
736 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) {
737   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
738   PetscInt    *diag, i;
739 
740   PetscFunctionBegin;
741   *missing = PETSC_FALSE;
742   if (A->rmap->n > 0 && !(a->colidx)) {
743     *missing = PETSC_TRUE;
744     if (d) *d = 0;
745     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
746   } else {
747     diag = a->diag;
748     for (i = 0; i < A->rmap->n; i++) {
749       if (diag[i] == -1) {
750         *missing = PETSC_TRUE;
751         if (d) *d = i;
752         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
753         break;
754       }
755     }
756   }
757   PetscFunctionReturn(0);
758 }
759 
760 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) {
761   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
762   PetscInt     i, j, m = A->rmap->n, shift;
763 
764   PetscFunctionBegin;
765   if (!a->diag) {
766     PetscCall(PetscMalloc1(m, &a->diag));
767     a->free_diag = PETSC_TRUE;
768   }
769   for (i = 0; i < m; i++) {                      /* loop over rows */
770     shift      = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
771     a->diag[i] = -1;
772     for (j = 0; j < a->rlen[i]; j++) {
773       if (a->colidx[shift + j * 8] == i) {
774         a->diag[i] = shift + j * 8;
775         break;
776       }
777     }
778   }
779   PetscFunctionReturn(0);
780 }
781 
782 /*
783   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
784 */
785 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) {
786   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
787   PetscInt     i, *diag, m = A->rmap->n;
788   MatScalar   *val = a->val;
789   PetscScalar *idiag, *mdiag;
790 
791   PetscFunctionBegin;
792   if (a->idiagvalid) PetscFunctionReturn(0);
793   PetscCall(MatMarkDiagonal_SeqSELL(A));
794   diag = a->diag;
795   if (!a->idiag) {
796     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
797     val = a->val;
798   }
799   mdiag = a->mdiag;
800   idiag = a->idiag;
801 
802   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
803     for (i = 0; i < m; i++) {
804       mdiag[i] = val[diag[i]];
805       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
806         if (PetscRealPart(fshift)) {
807           PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
808           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
809           A->factorerror_zeropivot_value = 0.0;
810           A->factorerror_zeropivot_row   = i;
811         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
812       }
813       idiag[i] = 1.0 / val[diag[i]];
814     }
815     PetscCall(PetscLogFlops(m));
816   } else {
817     for (i = 0; i < m; i++) {
818       mdiag[i] = val[diag[i]];
819       idiag[i] = omega / (fshift + val[diag[i]]);
820     }
821     PetscCall(PetscLogFlops(2.0 * m));
822   }
823   a->idiagvalid = PETSC_TRUE;
824   PetscFunctionReturn(0);
825 }
826 
827 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) {
828   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
829 
830   PetscFunctionBegin;
831   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
832   PetscCall(MatSeqSELLInvalidateDiagonal(A));
833   PetscFunctionReturn(0);
834 }
835 
836 PetscErrorCode MatDestroy_SeqSELL(Mat A) {
837   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
838 
839   PetscFunctionBegin;
840 #if defined(PETSC_USE_LOG)
841   PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
842 #endif
843   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
844   PetscCall(ISDestroy(&a->row));
845   PetscCall(ISDestroy(&a->col));
846   PetscCall(PetscFree(a->diag));
847   PetscCall(PetscFree(a->rlen));
848   PetscCall(PetscFree(a->sliidx));
849   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
850   PetscCall(PetscFree(a->solve_work));
851   PetscCall(ISDestroy(&a->icol));
852   PetscCall(PetscFree(a->saved_values));
853   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
854 
855   PetscCall(PetscFree(A->data));
856 
857   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
858   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
859   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
860   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
861   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
862   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
863   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
864   PetscFunctionReturn(0);
865 }
866 
867 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) {
868   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
869 
870   PetscFunctionBegin;
871   switch (op) {
872   case MAT_ROW_ORIENTED: a->roworiented = flg; break;
873   case MAT_KEEP_NONZERO_PATTERN: a->keepnonzeropattern = flg; break;
874   case MAT_NEW_NONZERO_LOCATIONS: a->nonew = (flg ? 0 : 1); break;
875   case MAT_NEW_NONZERO_LOCATION_ERR: a->nonew = (flg ? -1 : 0); break;
876   case MAT_NEW_NONZERO_ALLOCATION_ERR: a->nonew = (flg ? -2 : 0); break;
877   case MAT_UNUSED_NONZERO_LOCATION_ERR: a->nounused = (flg ? -1 : 0); break;
878   case MAT_FORCE_DIAGONAL_ENTRIES:
879   case MAT_IGNORE_OFF_PROC_ENTRIES:
880   case MAT_USE_HASH_TABLE:
881   case MAT_SORTED_FULL: PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op])); break;
882   case MAT_SPD:
883   case MAT_SYMMETRIC:
884   case MAT_STRUCTURALLY_SYMMETRIC:
885   case MAT_HERMITIAN:
886   case MAT_SYMMETRY_ETERNAL:
887   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
888   case MAT_SPD_ETERNAL:
889     /* These options are handled directly by MatSetOption() */
890     break;
891   default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
892   }
893   PetscFunctionReturn(0);
894 }
895 
896 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) {
897   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
898   PetscInt     i, j, n, shift;
899   PetscScalar *x, zero = 0.0;
900 
901   PetscFunctionBegin;
902   PetscCall(VecGetLocalSize(v, &n));
903   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
904 
905   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
906     PetscInt *diag = a->diag;
907     PetscCall(VecGetArray(v, &x));
908     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
909     PetscCall(VecRestoreArray(v, &x));
910     PetscFunctionReturn(0);
911   }
912 
913   PetscCall(VecSet(v, zero));
914   PetscCall(VecGetArray(v, &x));
915   for (i = 0; i < n; i++) {                 /* loop over rows */
916     shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
917     x[i]  = 0;
918     for (j = 0; j < a->rlen[i]; j++) {
919       if (a->colidx[shift + j * 8] == i) {
920         x[i] = a->val[shift + j * 8];
921         break;
922       }
923     }
924   }
925   PetscCall(VecRestoreArray(v, &x));
926   PetscFunctionReturn(0);
927 }
928 
929 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) {
930   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
931   const PetscScalar *l, *r;
932   PetscInt           i, j, m, n, row;
933 
934   PetscFunctionBegin;
935   if (ll) {
936     /* The local size is used so that VecMPI can be passed to this routine
937        by MatDiagonalScale_MPISELL */
938     PetscCall(VecGetLocalSize(ll, &m));
939     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
940     PetscCall(VecGetArrayRead(ll, &l));
941     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
942       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
943         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
944           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
945         }
946       } else {
947         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
948       }
949     }
950     PetscCall(VecRestoreArrayRead(ll, &l));
951     PetscCall(PetscLogFlops(a->nz));
952   }
953   if (rr) {
954     PetscCall(VecGetLocalSize(rr, &n));
955     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
956     PetscCall(VecGetArrayRead(rr, &r));
957     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
958       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
959         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
960           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
961         }
962       } else {
963         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
964       }
965     }
966     PetscCall(VecRestoreArrayRead(rr, &r));
967     PetscCall(PetscLogFlops(a->nz));
968   }
969   PetscCall(MatSeqSELLInvalidateDiagonal(A));
970   PetscFunctionReturn(0);
971 }
972 
973 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) {
974   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
975   PetscInt    *cp, i, k, low, high, t, row, col, l;
976   PetscInt     shift;
977   MatScalar   *vp;
978 
979   PetscFunctionBegin;
980   for (k = 0; k < m; k++) { /* loop over requested rows */
981     row = im[k];
982     if (row < 0) continue;
983     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
984     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
985     cp    = a->colidx + shift;                  /* pointer to the row */
986     vp    = a->val + shift;                     /* pointer to the row */
987     for (l = 0; l < n; l++) {                   /* loop over requested columns */
988       col = in[l];
989       if (col < 0) continue;
990       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
991       high = a->rlen[row];
992       low  = 0; /* assume unsorted */
993       while (high - low > 5) {
994         t = (low + high) / 2;
995         if (*(cp + t * 8) > col) high = t;
996         else low = t;
997       }
998       for (i = low; i < high; i++) {
999         if (*(cp + 8 * i) > col) break;
1000         if (*(cp + 8 * i) == col) {
1001           *v++ = *(vp + 8 * i);
1002           goto finished;
1003         }
1004       }
1005       *v++ = 0.0;
1006     finished:;
1007     }
1008   }
1009   PetscFunctionReturn(0);
1010 }
1011 
1012 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) {
1013   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1014   PetscInt          i, j, m = A->rmap->n, shift;
1015   const char       *name;
1016   PetscViewerFormat format;
1017 
1018   PetscFunctionBegin;
1019   PetscCall(PetscViewerGetFormat(viewer, &format));
1020   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1021     PetscInt nofinalvalue = 0;
1022     /*
1023     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1024       nofinalvalue = 1;
1025     }
1026     */
1027     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1028     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1029     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1030 #if defined(PETSC_USE_COMPLEX)
1031     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1032 #else
1033     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1034 #endif
1035     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1036 
1037     for (i = 0; i < m; i++) {
1038       shift = a->sliidx[i >> 3] + (i & 0x07);
1039       for (j = 0; j < a->rlen[i]; j++) {
1040 #if defined(PETSC_USE_COMPLEX)
1041         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1042 #else
1043         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]));
1044 #endif
1045       }
1046     }
1047     /*
1048     if (nofinalvalue) {
1049 #if defined(PETSC_USE_COMPLEX)
1050       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1051 #else
1052       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1053 #endif
1054     }
1055     */
1056     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1057     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1058     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1059   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1060     PetscFunctionReturn(0);
1061   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1062     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1063     for (i = 0; i < m; i++) {
1064       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1065       shift = a->sliidx[i >> 3] + (i & 0x07);
1066       for (j = 0; j < a->rlen[i]; j++) {
1067 #if defined(PETSC_USE_COMPLEX)
1068         if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1069           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1070         } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1071           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1072         } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1073           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1074         }
1075 #else
1076         if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1077 #endif
1078       }
1079       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1080     }
1081     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1082   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1083     PetscInt    cnt = 0, jcnt;
1084     PetscScalar value;
1085 #if defined(PETSC_USE_COMPLEX)
1086     PetscBool realonly = PETSC_TRUE;
1087     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1088       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1089         realonly = PETSC_FALSE;
1090         break;
1091       }
1092     }
1093 #endif
1094 
1095     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1096     for (i = 0; i < m; i++) {
1097       jcnt  = 0;
1098       shift = a->sliidx[i >> 3] + (i & 0x07);
1099       for (j = 0; j < A->cmap->n; j++) {
1100         if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1101           value = a->val[cnt++];
1102           jcnt++;
1103         } else {
1104           value = 0.0;
1105         }
1106 #if defined(PETSC_USE_COMPLEX)
1107         if (realonly) {
1108           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1109         } else {
1110           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1111         }
1112 #else
1113         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1114 #endif
1115       }
1116       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1117     }
1118     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1119   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1120     PetscInt fshift = 1;
1121     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1122 #if defined(PETSC_USE_COMPLEX)
1123     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1124 #else
1125     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1126 #endif
1127     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1128     for (i = 0; i < m; i++) {
1129       shift = a->sliidx[i >> 3] + (i & 0x07);
1130       for (j = 0; j < a->rlen[i]; j++) {
1131 #if defined(PETSC_USE_COMPLEX)
1132         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1133 #else
1134         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]));
1135 #endif
1136       }
1137     }
1138     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1139   } else if (format == PETSC_VIEWER_NATIVE) {
1140     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1141       PetscInt row;
1142       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1143       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1144 #if defined(PETSC_USE_COMPLEX)
1145         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1146           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1147         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1148           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1149         } else {
1150           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1151         }
1152 #else
1153         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]));
1154 #endif
1155       }
1156     }
1157   } else {
1158     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1159     if (A->factortype) {
1160       for (i = 0; i < m; i++) {
1161         shift = a->sliidx[i >> 3] + (i & 0x07);
1162         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1163         /* L part */
1164         for (j = shift; j < a->diag[i]; j += 8) {
1165 #if defined(PETSC_USE_COMPLEX)
1166           if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1167             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1168           } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1169             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1170           } else {
1171             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1172           }
1173 #else
1174           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1175 #endif
1176         }
1177         /* diagonal */
1178         j = a->diag[i];
1179 #if defined(PETSC_USE_COMPLEX)
1180         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1181           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1182         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1183           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1184         } else {
1185           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1186         }
1187 #else
1188         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1189 #endif
1190 
1191         /* U part */
1192         for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1193 #if defined(PETSC_USE_COMPLEX)
1194           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1195             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1196           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1197             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1198           } else {
1199             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1200           }
1201 #else
1202           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1203 #endif
1204         }
1205         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1206       }
1207     } else {
1208       for (i = 0; i < m; i++) {
1209         shift = a->sliidx[i >> 3] + (i & 0x07);
1210         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1211         for (j = 0; j < a->rlen[i]; j++) {
1212 #if defined(PETSC_USE_COMPLEX)
1213           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1214             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1215           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1216             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1217           } else {
1218             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1219           }
1220 #else
1221           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1222 #endif
1223         }
1224         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1225       }
1226     }
1227     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1228   }
1229   PetscCall(PetscViewerFlush(viewer));
1230   PetscFunctionReturn(0);
1231 }
1232 
1233 #include <petscdraw.h>
1234 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) {
1235   Mat               A = (Mat)Aa;
1236   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1237   PetscInt          i, j, m = A->rmap->n, shift;
1238   int               color;
1239   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1240   PetscViewer       viewer;
1241   PetscViewerFormat format;
1242 
1243   PetscFunctionBegin;
1244   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1245   PetscCall(PetscViewerGetFormat(viewer, &format));
1246   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1247 
1248   /* loop over matrix elements drawing boxes */
1249 
1250   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1251     PetscDrawCollectiveBegin(draw);
1252     /* Blue for negative, Cyan for zero and  Red for positive */
1253     color = PETSC_DRAW_BLUE;
1254     for (i = 0; i < m; i++) {
1255       shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1256       y_l   = m - i - 1.0;
1257       y_r   = y_l + 1.0;
1258       for (j = 0; j < a->rlen[i]; j++) {
1259         x_l = a->colidx[shift + j * 8];
1260         x_r = x_l + 1.0;
1261         if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1262         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1263       }
1264     }
1265     color = PETSC_DRAW_CYAN;
1266     for (i = 0; i < m; i++) {
1267       shift = a->sliidx[i >> 3] + (i & 0x07);
1268       y_l   = m - i - 1.0;
1269       y_r   = y_l + 1.0;
1270       for (j = 0; j < a->rlen[i]; j++) {
1271         x_l = a->colidx[shift + j * 8];
1272         x_r = x_l + 1.0;
1273         if (a->val[shift + 8 * j] != 0.) continue;
1274         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1275       }
1276     }
1277     color = PETSC_DRAW_RED;
1278     for (i = 0; i < m; i++) {
1279       shift = a->sliidx[i >> 3] + (i & 0x07);
1280       y_l   = m - i - 1.0;
1281       y_r   = y_l + 1.0;
1282       for (j = 0; j < a->rlen[i]; j++) {
1283         x_l = a->colidx[shift + j * 8];
1284         x_r = x_l + 1.0;
1285         if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1286         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1287       }
1288     }
1289     PetscDrawCollectiveEnd(draw);
1290   } else {
1291     /* use contour shading to indicate magnitude of values */
1292     /* first determine max of all nonzero values */
1293     PetscReal minv = 0.0, maxv = 0.0;
1294     PetscInt  count = 0;
1295     PetscDraw popup;
1296     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1297       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1298     }
1299     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1300     PetscCall(PetscDrawGetPopup(draw, &popup));
1301     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1302 
1303     PetscDrawCollectiveBegin(draw);
1304     for (i = 0; i < m; i++) {
1305       shift = a->sliidx[i >> 3] + (i & 0x07);
1306       y_l   = m - i - 1.0;
1307       y_r   = y_l + 1.0;
1308       for (j = 0; j < a->rlen[i]; j++) {
1309         x_l   = a->colidx[shift + j * 8];
1310         x_r   = x_l + 1.0;
1311         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1312         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1313         count++;
1314       }
1315     }
1316     PetscDrawCollectiveEnd(draw);
1317   }
1318   PetscFunctionReturn(0);
1319 }
1320 
1321 #include <petscdraw.h>
1322 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) {
1323   PetscDraw draw;
1324   PetscReal xr, yr, xl, yl, h, w;
1325   PetscBool isnull;
1326 
1327   PetscFunctionBegin;
1328   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1329   PetscCall(PetscDrawIsNull(draw, &isnull));
1330   if (isnull) PetscFunctionReturn(0);
1331 
1332   xr = A->cmap->n;
1333   yr = A->rmap->n;
1334   h  = yr / 10.0;
1335   w  = xr / 10.0;
1336   xr += w;
1337   yr += h;
1338   xl = -w;
1339   yl = -h;
1340   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1341   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1342   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1343   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1344   PetscCall(PetscDrawSave(draw));
1345   PetscFunctionReturn(0);
1346 }
1347 
1348 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) {
1349   PetscBool iascii, isbinary, isdraw;
1350 
1351   PetscFunctionBegin;
1352   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1353   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1354   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1355   if (iascii) {
1356     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1357   } else if (isbinary) {
1358     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1359   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1360   PetscFunctionReturn(0);
1361 }
1362 
1363 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) {
1364   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1365   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1366   MatScalar   *vp;
1367 
1368   PetscFunctionBegin;
1369   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1370   /* To do: compress out the unused elements */
1371   PetscCall(MatMarkDiagonal_SeqSELL(A));
1372   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1373   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1374   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1375   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1376   for (i = 0; i < a->totalslices; ++i) {
1377     shift = a->sliidx[i];                                      /* starting index of the slice */
1378     cp    = a->colidx + shift;                                 /* pointer to the column indices of the slice */
1379     vp    = a->val + shift;                                    /* pointer to the nonzero values of the slice */
1380     for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1381       row     = 8 * i + row_in_slice;
1382       nrow    = a->rlen[row]; /* number of nonzeros in row */
1383       /*
1384         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1385         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1386       */
1387       lastcol = 0;
1388       if (nrow > 0) {                                /* nonempty row */
1389         lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1390       } else if (!row_in_slice) {                    /* first row of the currect slice is empty */
1391         for (j = 1; j < 8; j++) {
1392           if (a->rlen[8 * i + j]) {
1393             lastcol = cp[j];
1394             break;
1395           }
1396         }
1397       } else {
1398         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1399       }
1400 
1401       for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1402         cp[8 * k + row_in_slice] = lastcol;
1403         vp[8 * k + row_in_slice] = (MatScalar)0;
1404       }
1405     }
1406   }
1407 
1408   A->info.mallocs += a->reallocs;
1409   a->reallocs = 0;
1410 
1411   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1412   PetscFunctionReturn(0);
1413 }
1414 
1415 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) {
1416   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1417 
1418   PetscFunctionBegin;
1419   info->block_size   = 1.0;
1420   info->nz_allocated = a->maxallocmat;
1421   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1422   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1423   info->assemblies   = A->num_ass;
1424   info->mallocs      = A->info.mallocs;
1425   info->memory       = 0; /* REVIEW ME */
1426   if (A->factortype) {
1427     info->fill_ratio_given  = A->info.fill_ratio_given;
1428     info->fill_ratio_needed = A->info.fill_ratio_needed;
1429     info->factor_mallocs    = A->info.factor_mallocs;
1430   } else {
1431     info->fill_ratio_given  = 0;
1432     info->fill_ratio_needed = 0;
1433     info->factor_mallocs    = 0;
1434   }
1435   PetscFunctionReturn(0);
1436 }
1437 
1438 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) {
1439   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1440   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1441   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1442   MatScalar   *vp, value;
1443 
1444   PetscFunctionBegin;
1445   for (k = 0; k < m; k++) { /* loop over added rows */
1446     row = im[k];
1447     if (row < 0) continue;
1448     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1449     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1450     cp    = a->colidx + shift;                  /* pointer to the row */
1451     vp    = a->val + shift;                     /* pointer to the row */
1452     nrow  = a->rlen[row];
1453     low   = 0;
1454     high  = nrow;
1455 
1456     for (l = 0; l < n; l++) { /* loop over added columns */
1457       col = in[l];
1458       if (col < 0) continue;
1459       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1460       if (a->roworiented) {
1461         value = v[l + k * n];
1462       } else {
1463         value = v[k + l * m];
1464       }
1465       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1466 
1467       /* search in this row for the specified column, i indicates the column to be set */
1468       if (col <= lastcol) low = 0;
1469       else high = nrow;
1470       lastcol = col;
1471       while (high - low > 5) {
1472         t = (low + high) / 2;
1473         if (*(cp + t * 8) > col) high = t;
1474         else low = t;
1475       }
1476       for (i = low; i < high; i++) {
1477         if (*(cp + i * 8) > col) break;
1478         if (*(cp + i * 8) == col) {
1479           if (is == ADD_VALUES) *(vp + i * 8) += value;
1480           else *(vp + i * 8) = value;
1481           low = i + 1;
1482           goto noinsert;
1483         }
1484       }
1485       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1486       if (nonew == 1) goto noinsert;
1487       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1488       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1489       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1490       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1491       for (ii = nrow - 1; ii >= i; ii--) {
1492         *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1493         *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1494       }
1495       a->rlen[row]++;
1496       *(cp + i * 8) = col;
1497       *(vp + i * 8) = value;
1498       a->nz++;
1499       A->nonzerostate++;
1500       low = i + 1;
1501       high++;
1502       nrow++;
1503     noinsert:;
1504     }
1505     a->rlen[row] = nrow;
1506   }
1507   PetscFunctionReturn(0);
1508 }
1509 
1510 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) {
1511   PetscFunctionBegin;
1512   /* If the two matrices have the same copy implementation, use fast copy. */
1513   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1514     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1515     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1516 
1517     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1518     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1519   } else {
1520     PetscCall(MatCopy_Basic(A, B, str));
1521   }
1522   PetscFunctionReturn(0);
1523 }
1524 
1525 PetscErrorCode MatSetUp_SeqSELL(Mat A) {
1526   PetscFunctionBegin;
1527   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1528   PetscFunctionReturn(0);
1529 }
1530 
1531 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) {
1532   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1533 
1534   PetscFunctionBegin;
1535   *array = a->val;
1536   PetscFunctionReturn(0);
1537 }
1538 
1539 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) {
1540   PetscFunctionBegin;
1541   PetscFunctionReturn(0);
1542 }
1543 
1544 PetscErrorCode MatRealPart_SeqSELL(Mat A) {
1545   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1546   PetscInt     i;
1547   MatScalar   *aval = a->val;
1548 
1549   PetscFunctionBegin;
1550   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1551   PetscFunctionReturn(0);
1552 }
1553 
1554 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A) {
1555   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1556   PetscInt     i;
1557   MatScalar   *aval = a->val;
1558 
1559   PetscFunctionBegin;
1560   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1561   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) {
1566   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1567   MatScalar   *aval   = a->val;
1568   PetscScalar  oalpha = alpha;
1569   PetscBLASInt one    = 1, size;
1570 
1571   PetscFunctionBegin;
1572   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1573   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1574   PetscCall(PetscLogFlops(a->nz));
1575   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1576   PetscFunctionReturn(0);
1577 }
1578 
1579 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) {
1580   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1581 
1582   PetscFunctionBegin;
1583   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1584   PetscCall(MatShift_Basic(Y, a));
1585   PetscFunctionReturn(0);
1586 }
1587 
1588 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) {
1589   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1590   PetscScalar       *x, sum, *t;
1591   const MatScalar   *idiag = NULL, *mdiag;
1592   const PetscScalar *b, *xb;
1593   PetscInt           n, m = A->rmap->n, i, j, shift;
1594   const PetscInt    *diag;
1595 
1596   PetscFunctionBegin;
1597   its = its * lits;
1598 
1599   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1600   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1601   a->fshift = fshift;
1602   a->omega  = omega;
1603 
1604   diag  = a->diag;
1605   t     = a->ssor_work;
1606   idiag = a->idiag;
1607   mdiag = a->mdiag;
1608 
1609   PetscCall(VecGetArray(xx, &x));
1610   PetscCall(VecGetArrayRead(bb, &b));
1611   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1612   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1613   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1614   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1615 
1616   if (flag & SOR_ZERO_INITIAL_GUESS) {
1617     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1618       for (i = 0; i < m; i++) {
1619         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1620         sum   = b[i];
1621         n     = (diag[i] - shift) / 8;
1622         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1623         t[i] = sum;
1624         x[i] = sum * idiag[i];
1625       }
1626       xb = t;
1627       PetscCall(PetscLogFlops(a->nz));
1628     } else xb = b;
1629     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1630       for (i = m - 1; i >= 0; i--) {
1631         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1632         sum   = xb[i];
1633         n     = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1634         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1635         if (xb == b) {
1636           x[i] = sum * idiag[i];
1637         } else {
1638           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1639         }
1640       }
1641       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1642     }
1643     its--;
1644   }
1645   while (its--) {
1646     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1647       for (i = 0; i < m; i++) {
1648         /* lower */
1649         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1650         sum   = b[i];
1651         n     = (diag[i] - shift) / 8;
1652         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1653         t[i] = sum; /* save application of the lower-triangular part */
1654         /* upper */
1655         n    = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1656         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1657         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1658       }
1659       xb = t;
1660       PetscCall(PetscLogFlops(2.0 * a->nz));
1661     } else xb = b;
1662     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1663       for (i = m - 1; i >= 0; i--) {
1664         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1665         sum   = xb[i];
1666         if (xb == b) {
1667           /* whole matrix (no checkpointing available) */
1668           n = a->rlen[i];
1669           for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1670           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1671         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1672           n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1673           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1674           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1675         }
1676       }
1677       if (xb == b) {
1678         PetscCall(PetscLogFlops(2.0 * a->nz));
1679       } else {
1680         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1681       }
1682     }
1683   }
1684   PetscCall(VecRestoreArray(xx, &x));
1685   PetscCall(VecRestoreArrayRead(bb, &b));
1686   PetscFunctionReturn(0);
1687 }
1688 
1689 /* -------------------------------------------------------------------*/
1690 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1691                                        MatGetRow_SeqSELL,
1692                                        MatRestoreRow_SeqSELL,
1693                                        MatMult_SeqSELL,
1694                                        /* 4*/ MatMultAdd_SeqSELL,
1695                                        MatMultTranspose_SeqSELL,
1696                                        MatMultTransposeAdd_SeqSELL,
1697                                        NULL,
1698                                        NULL,
1699                                        NULL,
1700                                        /* 10*/ NULL,
1701                                        NULL,
1702                                        NULL,
1703                                        MatSOR_SeqSELL,
1704                                        NULL,
1705                                        /* 15*/ MatGetInfo_SeqSELL,
1706                                        MatEqual_SeqSELL,
1707                                        MatGetDiagonal_SeqSELL,
1708                                        MatDiagonalScale_SeqSELL,
1709                                        NULL,
1710                                        /* 20*/ NULL,
1711                                        MatAssemblyEnd_SeqSELL,
1712                                        MatSetOption_SeqSELL,
1713                                        MatZeroEntries_SeqSELL,
1714                                        /* 24*/ NULL,
1715                                        NULL,
1716                                        NULL,
1717                                        NULL,
1718                                        NULL,
1719                                        /* 29*/ MatSetUp_SeqSELL,
1720                                        NULL,
1721                                        NULL,
1722                                        NULL,
1723                                        NULL,
1724                                        /* 34*/ MatDuplicate_SeqSELL,
1725                                        NULL,
1726                                        NULL,
1727                                        NULL,
1728                                        NULL,
1729                                        /* 39*/ NULL,
1730                                        NULL,
1731                                        NULL,
1732                                        MatGetValues_SeqSELL,
1733                                        MatCopy_SeqSELL,
1734                                        /* 44*/ NULL,
1735                                        MatScale_SeqSELL,
1736                                        MatShift_SeqSELL,
1737                                        NULL,
1738                                        NULL,
1739                                        /* 49*/ NULL,
1740                                        NULL,
1741                                        NULL,
1742                                        NULL,
1743                                        NULL,
1744                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1745                                        NULL,
1746                                        NULL,
1747                                        NULL,
1748                                        NULL,
1749                                        /* 59*/ NULL,
1750                                        MatDestroy_SeqSELL,
1751                                        MatView_SeqSELL,
1752                                        NULL,
1753                                        NULL,
1754                                        /* 64*/ NULL,
1755                                        NULL,
1756                                        NULL,
1757                                        NULL,
1758                                        NULL,
1759                                        /* 69*/ NULL,
1760                                        NULL,
1761                                        NULL,
1762                                        NULL,
1763                                        NULL,
1764                                        /* 74*/ NULL,
1765                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1766                                        NULL,
1767                                        NULL,
1768                                        NULL,
1769                                        /* 79*/ NULL,
1770                                        NULL,
1771                                        NULL,
1772                                        NULL,
1773                                        NULL,
1774                                        /* 84*/ NULL,
1775                                        NULL,
1776                                        NULL,
1777                                        NULL,
1778                                        NULL,
1779                                        /* 89*/ NULL,
1780                                        NULL,
1781                                        NULL,
1782                                        NULL,
1783                                        NULL,
1784                                        /* 94*/ NULL,
1785                                        NULL,
1786                                        NULL,
1787                                        NULL,
1788                                        NULL,
1789                                        /* 99*/ NULL,
1790                                        NULL,
1791                                        NULL,
1792                                        MatConjugate_SeqSELL,
1793                                        NULL,
1794                                        /*104*/ NULL,
1795                                        NULL,
1796                                        NULL,
1797                                        NULL,
1798                                        NULL,
1799                                        /*109*/ NULL,
1800                                        NULL,
1801                                        NULL,
1802                                        NULL,
1803                                        MatMissingDiagonal_SeqSELL,
1804                                        /*114*/ NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        NULL,
1809                                        /*119*/ NULL,
1810                                        NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        NULL,
1814                                        /*124*/ NULL,
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        /*129*/ NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        /*134*/ NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        NULL,
1829                                        /*139*/ NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        MatFDColoringSetUp_SeqXAIJ,
1833                                        NULL,
1834                                        /*144*/ NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        NULL,
1839                                        NULL,
1840                                        /*150*/ NULL};
1841 
1842 PetscErrorCode MatStoreValues_SeqSELL(Mat mat) {
1843   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1844 
1845   PetscFunctionBegin;
1846   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1847 
1848   /* allocate space for values if not already there */
1849   if (!a->saved_values) { PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); }
1850 
1851   /* copy values over */
1852   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1853   PetscFunctionReturn(0);
1854 }
1855 
1856 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) {
1857   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1858 
1859   PetscFunctionBegin;
1860   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1861   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1862   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1863   PetscFunctionReturn(0);
1864 }
1865 
1866 /*@C
1867  MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
1868 
1869  Not Collective
1870 
1871  Input Parameters:
1872  .  mat - a `MATSEQSELL` matrix
1873  .  array - pointer to the data
1874 
1875  Level: intermediate
1876 
1877  .seealso: `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1878  @*/
1879 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array) {
1880   PetscFunctionBegin;
1881   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1882   PetscFunctionReturn(0);
1883 }
1884 
1885 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) {
1886   Mat_SeqSELL *b;
1887   PetscMPIInt  size;
1888 
1889   PetscFunctionBegin;
1890   PetscCall(PetscCitationsRegister(citation, &cited));
1891   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
1892   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
1893 
1894   PetscCall(PetscNew(&b));
1895 
1896   B->data = (void *)b;
1897 
1898   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
1899 
1900   b->row                = NULL;
1901   b->col                = NULL;
1902   b->icol               = NULL;
1903   b->reallocs           = 0;
1904   b->ignorezeroentries  = PETSC_FALSE;
1905   b->roworiented        = PETSC_TRUE;
1906   b->nonew              = 0;
1907   b->diag               = NULL;
1908   b->solve_work         = NULL;
1909   B->spptr              = NULL;
1910   b->saved_values       = NULL;
1911   b->idiag              = NULL;
1912   b->mdiag              = NULL;
1913   b->ssor_work          = NULL;
1914   b->omega              = 1.0;
1915   b->fshift             = 0.0;
1916   b->idiagvalid         = PETSC_FALSE;
1917   b->keepnonzeropattern = PETSC_FALSE;
1918 
1919   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
1920   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
1921   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
1922   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
1923   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
1924   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
1925   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
1926   PetscFunctionReturn(0);
1927 }
1928 
1929 /*
1930  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1931  */
1932 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) {
1933   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1934   PetscInt     i, m                           = A->rmap->n;
1935   PetscInt     totalslices = a->totalslices;
1936 
1937   PetscFunctionBegin;
1938   C->factortype = A->factortype;
1939   c->row        = NULL;
1940   c->col        = NULL;
1941   c->icol       = NULL;
1942   c->reallocs   = 0;
1943   C->assembled  = PETSC_TRUE;
1944 
1945   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
1946   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
1947 
1948   PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
1949   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
1950 
1951   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
1952   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
1953 
1954   /* allocate the matrix space */
1955   if (mallocmatspace) {
1956     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
1957 
1958     c->singlemalloc = PETSC_TRUE;
1959 
1960     if (m > 0) {
1961       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
1962       if (cpvalues == MAT_COPY_VALUES) {
1963         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
1964       } else {
1965         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
1966       }
1967     }
1968   }
1969 
1970   c->ignorezeroentries = a->ignorezeroentries;
1971   c->roworiented       = a->roworiented;
1972   c->nonew             = a->nonew;
1973   if (a->diag) {
1974     PetscCall(PetscMalloc1(m, &c->diag));
1975     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
1976   } else c->diag = NULL;
1977 
1978   c->solve_work         = NULL;
1979   c->saved_values       = NULL;
1980   c->idiag              = NULL;
1981   c->ssor_work          = NULL;
1982   c->keepnonzeropattern = a->keepnonzeropattern;
1983   c->free_val           = PETSC_TRUE;
1984   c->free_colidx        = PETSC_TRUE;
1985 
1986   c->maxallocmat  = a->maxallocmat;
1987   c->maxallocrow  = a->maxallocrow;
1988   c->rlenmax      = a->rlenmax;
1989   c->nz           = a->nz;
1990   C->preallocated = PETSC_TRUE;
1991 
1992   c->nonzerorowcnt = a->nonzerorowcnt;
1993   C->nonzerostate  = A->nonzerostate;
1994 
1995   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
1996   PetscFunctionReturn(0);
1997 }
1998 
1999 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) {
2000   PetscFunctionBegin;
2001   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2002   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2003   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2004   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2005   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2006   PetscFunctionReturn(0);
2007 }
2008 
2009 /*MC
2010    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2011    based on the sliced Ellpack format
2012 
2013    Options Database Keys:
2014 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2015 
2016    Level: beginner
2017 
2018 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2019 M*/
2020 
2021 /*MC
2022    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2023 
2024    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2025    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2026   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2027   for communicators controlling multiple processes.  It is recommended that you call both of
2028   the above preallocation routines for simplicity.
2029 
2030    Options Database Keys:
2031 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2032 
2033   Level: beginner
2034 
2035   Notes:
2036    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2037 
2038    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2039    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2040 
2041   Developer Notes:
2042    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2043 
2044    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2045 .vb
2046                             (2 0  3 4)
2047    Consider the matrix A =  (5 0  6 0)
2048                             (0 0  7 8)
2049                             (0 0  9 9)
2050 
2051    symbolically the Ellpack format can be written as
2052 
2053         (2 3 4 |)           (0 2 3 |)
2054    v =  (5 6 0 |)  colidx = (0 2 2 |)
2055         --------            ---------
2056         (7 8 |)             (2 3 |)
2057         (9 9 |)             (2 3 |)
2058 
2059     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2060     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2061     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2062 
2063     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2064 
2065 .ve
2066 
2067       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2068 
2069  References:
2070 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2071    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2072 
2073 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2074 M*/
2075 
2076 /*@C
2077        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2078 
2079  Collective on comm
2080 
2081  Input Parameters:
2082 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2083 .  m - number of rows
2084 .  n - number of columns
2085 .  rlenmax - maximum number of nonzeros in a row
2086 -  rlen - array containing the number of nonzeros in the various rows
2087  (possibly different for each row) or NULL
2088 
2089  Output Parameter:
2090 .  A - the matrix
2091 
2092  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2093  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2094  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2095 
2096  Notes:
2097  If nnz is given then nz is ignored
2098 
2099  Specify the preallocated storage with either rlenmax or rlen (not both).
2100  Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory
2101  allocation.  For large problems you MUST preallocate memory or you
2102  will get TERRIBLE performance, see the users' manual chapter on matrices.
2103 
2104  Level: intermediate
2105 
2106  .seealso: `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2107  @*/
2108 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A) {
2109   PetscFunctionBegin;
2110   PetscCall(MatCreate(comm, A));
2111   PetscCall(MatSetSizes(*A, m, n, m, n));
2112   PetscCall(MatSetType(*A, MATSEQSELL));
2113   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen));
2114   PetscFunctionReturn(0);
2115 }
2116 
2117 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) {
2118   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2119   PetscInt     totalslices = a->totalslices;
2120 
2121   PetscFunctionBegin;
2122   /* If the  matrix dimensions are not equal,or no of nonzeros */
2123   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2124     *flg = PETSC_FALSE;
2125     PetscFunctionReturn(0);
2126   }
2127   /* if the a->colidx are the same */
2128   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2129   if (!*flg) PetscFunctionReturn(0);
2130   /* if a->val are the same */
2131   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2132   PetscFunctionReturn(0);
2133 }
2134 
2135 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) {
2136   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2137 
2138   PetscFunctionBegin;
2139   a->idiagvalid = PETSC_FALSE;
2140   PetscFunctionReturn(0);
2141 }
2142 
2143 PetscErrorCode MatConjugate_SeqSELL(Mat A) {
2144 #if defined(PETSC_USE_COMPLEX)
2145   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2146   PetscInt     i;
2147   PetscScalar *val = a->val;
2148 
2149   PetscFunctionBegin;
2150   for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2151 #else
2152   PetscFunctionBegin;
2153 #endif
2154   PetscFunctionReturn(0);
2155 }
2156