xref: /petsc/src/mat/impls/sell/seq/sell.c (revision ec34bba6ec188c32e12dca9a8cfb2fb88bebbee0)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter nz
48  (or the array nnz).  By setting these parameters accurately, performance
49  during matrix assembly can be increased significantly.
50 
51  Collective
52 
53  Input Parameters:
54  +  B - The `MATSEQSELL` matrix
55  .  rlenmax - number of nonzeros per row (same for all rows)
56  -  rlen - array containing the number of nonzeros in the various rows
57  (possibly different for each row) or `NULL`
58 
59  Level: intermediate
60 
61  Notes:
62  If `rlen` is given then `rlenmax` is ignored.
63 
64  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
65  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
66  allocation.  For large problems you MUST preallocate memory or you
67  will get TERRIBLE performance, see the users' manual chapter on matrices.
68 
69  You can call `MatGetInfo()` to get information on how effective the preallocation was;
70  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
71  You can also run with the option `-info` and look for messages with the string
72  malloc in them to see if additional memory allocation was needed.
73 
74  Developers Note:
75  Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
76  entries or columns indices.
77 
78  The maximum number of nonzeos in any row should be as accurate as possible.
79  If it is underestimated, you will get bad performance due to reallocation
80  (`MatSeqXSELLReallocateSELL()`).
81 
82  .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
83  @*/
84 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
85 {
86   PetscFunctionBegin;
87   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
88   PetscValidType(B, 1);
89   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
90   PetscFunctionReturn(PETSC_SUCCESS);
91 }
92 
93 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
94 {
95   Mat_SeqSELL *b;
96   PetscInt     i, j, totalslices;
97   PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
98 
99   PetscFunctionBegin;
100   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
101   if (maxallocrow == MAT_SKIP_ALLOCATION) {
102     skipallocation = PETSC_TRUE;
103     maxallocrow    = 0;
104   }
105 
106   PetscCall(PetscLayoutSetUp(B->rmap));
107   PetscCall(PetscLayoutSetUp(B->cmap));
108 
109   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
110   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
111   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
112   if (rlen) {
113     for (i = 0; i < B->rmap->n; i++) {
114       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
115       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
116     }
117   }
118 
119   B->preallocated = PETSC_TRUE;
120 
121   b = (Mat_SeqSELL *)B->data;
122 
123   totalslices    = PetscCeilInt(B->rmap->n, 8);
124   b->totalslices = totalslices;
125   if (!skipallocation) {
126     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));
127 
128     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
129       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
130     }
131     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
132       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
133       else if (maxallocrow < 0) maxallocrow = 1;
134       for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
135     } else {
136       maxallocrow  = 0;
137       b->sliidx[0] = 0;
138       for (i = 1; i < totalslices; i++) {
139         b->sliidx[i] = 0;
140         for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
141         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
142         PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
143       }
144       /* last slice */
145       b->sliidx[totalslices] = 0;
146       for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
147       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
148       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
149     }
150 
151     /* allocate space for val, colidx, rlen */
152     /* FIXME: should B's old memory be unlogged? */
153     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
154     /* FIXME: assuming an element of the bit array takes 8 bits */
155     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
156     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
157     PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
158 
159     b->singlemalloc = PETSC_TRUE;
160     b->free_val     = PETSC_TRUE;
161     b->free_colidx  = PETSC_TRUE;
162   } else {
163     b->free_val    = PETSC_FALSE;
164     b->free_colidx = PETSC_FALSE;
165   }
166 
167   b->nz               = 0;
168   b->maxallocrow      = maxallocrow;
169   b->rlenmax          = maxallocrow;
170   b->maxallocmat      = b->sliidx[totalslices];
171   B->info.nz_unneeded = (double)b->maxallocmat;
172   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
173   PetscFunctionReturn(PETSC_SUCCESS);
174 }
175 
176 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
177 {
178   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
179   PetscInt     shift;
180 
181   PetscFunctionBegin;
182   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
183   if (nz) *nz = a->rlen[row];
184   shift = a->sliidx[row >> 3] + (row & 0x07);
185   if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
186   if (idx) {
187     PetscInt j;
188     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
189     *idx = a->getrowcols;
190   }
191   if (v) {
192     PetscInt j;
193     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
194     *v = a->getrowvals;
195   }
196   PetscFunctionReturn(PETSC_SUCCESS);
197 }
198 
199 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
200 {
201   PetscFunctionBegin;
202   PetscFunctionReturn(PETSC_SUCCESS);
203 }
204 
205 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
206 {
207   Mat          B;
208   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
209   PetscInt     i;
210 
211   PetscFunctionBegin;
212   if (reuse == MAT_REUSE_MATRIX) {
213     B = *newmat;
214     PetscCall(MatZeroEntries(B));
215   } else {
216     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
217     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
218     PetscCall(MatSetType(B, MATSEQAIJ));
219     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
220   }
221 
222   for (i = 0; i < A->rmap->n; i++) {
223     PetscInt     nz = 0, *cols = NULL;
224     PetscScalar *vals = NULL;
225 
226     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
227     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
228     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
229   }
230 
231   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
232   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
233   B->rmap->bs = A->rmap->bs;
234 
235   if (reuse == MAT_INPLACE_MATRIX) {
236     PetscCall(MatHeaderReplace(A, &B));
237   } else {
238     *newmat = B;
239   }
240   PetscFunctionReturn(PETSC_SUCCESS);
241 }
242 
243 #include <../src/mat/impls/aij/seq/aij.h>
244 
245 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
246 {
247   Mat                B;
248   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
249   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
250   const PetscInt    *cols;
251   const PetscScalar *vals;
252 
253   PetscFunctionBegin;
254 
255   if (reuse == MAT_REUSE_MATRIX) {
256     B = *newmat;
257   } else {
258     if (PetscDefined(USE_DEBUG) || !a->ilen) {
259       PetscCall(PetscMalloc1(m, &rowlengths));
260       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
261     }
262     if (PetscDefined(USE_DEBUG) && a->ilen) {
263       PetscBool eq;
264       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
265       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
266       PetscCall(PetscFree(rowlengths));
267       rowlengths = a->ilen;
268     } else if (a->ilen) rowlengths = a->ilen;
269     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
270     PetscCall(MatSetSizes(B, m, n, m, n));
271     PetscCall(MatSetType(B, MATSEQSELL));
272     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
273     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
274   }
275 
276   for (row = 0; row < m; row++) {
277     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
278     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
279     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
280   }
281   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
282   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
283   B->rmap->bs = A->rmap->bs;
284 
285   if (reuse == MAT_INPLACE_MATRIX) {
286     PetscCall(MatHeaderReplace(A, &B));
287   } else {
288     *newmat = B;
289   }
290   PetscFunctionReturn(PETSC_SUCCESS);
291 }
292 
293 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
294 {
295   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
296   PetscScalar       *y;
297   const PetscScalar *x;
298   const MatScalar   *aval        = a->val;
299   PetscInt           totalslices = a->totalslices;
300   const PetscInt    *acolidx     = a->colidx;
301   PetscInt           i, j;
302 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
303   __m512d  vec_x, vec_y, vec_vals;
304   __m256i  vec_idx;
305   __mmask8 mask;
306   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
307   __m256i  vec_idx2, vec_idx3, vec_idx4;
308 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
309   __m128i   vec_idx;
310   __m256d   vec_x, vec_y, vec_y2, vec_vals;
311   MatScalar yval;
312   PetscInt  r, rows_left, row, nnz_in_row;
313 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
314   __m128d   vec_x_tmp;
315   __m256d   vec_x, vec_y, vec_y2, vec_vals;
316   MatScalar yval;
317   PetscInt  r, rows_left, row, nnz_in_row;
318 #else
319   PetscScalar sum[8];
320 #endif
321 
322 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
323   #pragma disjoint(*x, *y, *aval)
324 #endif
325 
326   PetscFunctionBegin;
327   PetscCall(VecGetArrayRead(xx, &x));
328   PetscCall(VecGetArray(yy, &y));
329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
330   for (i = 0; i < totalslices; i++) { /* loop over slices */
331     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
332     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
333 
334     vec_y  = _mm512_setzero_pd();
335     vec_y2 = _mm512_setzero_pd();
336     vec_y3 = _mm512_setzero_pd();
337     vec_y4 = _mm512_setzero_pd();
338 
339     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
340     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
341     case 3:
342       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
343       acolidx += 8;
344       aval += 8;
345       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
346       acolidx += 8;
347       aval += 8;
348       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
349       acolidx += 8;
350       aval += 8;
351       j += 3;
352       break;
353     case 2:
354       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
355       acolidx += 8;
356       aval += 8;
357       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
358       acolidx += 8;
359       aval += 8;
360       j += 2;
361       break;
362     case 1:
363       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
364       acolidx += 8;
365       aval += 8;
366       j += 1;
367       break;
368     }
369   #pragma novector
370     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
371       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
372       acolidx += 8;
373       aval += 8;
374       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
375       acolidx += 8;
376       aval += 8;
377       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
378       acolidx += 8;
379       aval += 8;
380       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
381       acolidx += 8;
382       aval += 8;
383     }
384 
385     vec_y = _mm512_add_pd(vec_y, vec_y2);
386     vec_y = _mm512_add_pd(vec_y, vec_y3);
387     vec_y = _mm512_add_pd(vec_y, vec_y4);
388     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
389       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
390       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
391     } else {
392       _mm512_storeu_pd(&y[8 * i], vec_y);
393     }
394   }
395 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
396   for (i = 0; i < totalslices; i++) { /* loop over full slices */
397     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
398     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
399 
400     /* last slice may have padding rows. Don't use vectorization. */
401     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
402       rows_left = A->rmap->n - 8 * i;
403       for (r = 0; r < rows_left; ++r) {
404         yval       = (MatScalar)0;
405         row        = 8 * i + r;
406         nnz_in_row = a->rlen[row];
407         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
408         y[row] = yval;
409       }
410       break;
411     }
412 
413     vec_y  = _mm256_setzero_pd();
414     vec_y2 = _mm256_setzero_pd();
415 
416   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
417   #pragma novector
418   #pragma unroll(2)
419     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
420       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
421       aval += 4;
422       acolidx += 4;
423       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
424       aval += 4;
425       acolidx += 4;
426     }
427 
428     _mm256_storeu_pd(y + i * 8, vec_y);
429     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
430   }
431 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
432   for (i = 0; i < totalslices; i++) { /* loop over full slices */
433     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
434     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
435 
436     vec_y  = _mm256_setzero_pd();
437     vec_y2 = _mm256_setzero_pd();
438 
439     /* last slice may have padding rows. Don't use vectorization. */
440     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
441       rows_left = A->rmap->n - 8 * i;
442       for (r = 0; r < rows_left; ++r) {
443         yval       = (MatScalar)0;
444         row        = 8 * i + r;
445         nnz_in_row = a->rlen[row];
446         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
447         y[row] = yval;
448       }
449       break;
450     }
451 
452   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
453   #pragma novector
454   #pragma unroll(2)
455     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
456       vec_vals  = _mm256_loadu_pd(aval);
457       vec_x_tmp = _mm_setzero_pd();
458       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
459       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
460       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
464       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
465       aval += 4;
466 
467       vec_vals  = _mm256_loadu_pd(aval);
468       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
469       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
470       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
471       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
472       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
473       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
474       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
475       aval += 4;
476     }
477 
478     _mm256_storeu_pd(y + i * 8, vec_y);
479     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
480   }
481 #else
482   for (i = 0; i < totalslices; i++) { /* loop over slices */
483     for (j = 0; j < 8; j++) sum[j] = 0.0;
484     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
485       sum[0] += aval[j] * x[acolidx[j]];
486       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
487       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
488       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
489       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
490       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
491       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
492       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
493     }
494     if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
495       for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
496     } else {
497       for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
498     }
499   }
500 #endif
501 
502   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
503   PetscCall(VecRestoreArrayRead(xx, &x));
504   PetscCall(VecRestoreArray(yy, &y));
505   PetscFunctionReturn(PETSC_SUCCESS);
506 }
507 
508 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
509 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
510 {
511   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
512   PetscScalar       *y, *z;
513   const PetscScalar *x;
514   const MatScalar   *aval        = a->val;
515   PetscInt           totalslices = a->totalslices;
516   const PetscInt    *acolidx     = a->colidx;
517   PetscInt           i, j;
518 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
519   __m512d  vec_x, vec_y, vec_vals;
520   __m256i  vec_idx;
521   __mmask8 mask;
522   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
523   __m256i  vec_idx2, vec_idx3, vec_idx4;
524 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
525   __m128d   vec_x_tmp;
526   __m256d   vec_x, vec_y, vec_y2, vec_vals;
527   MatScalar yval;
528   PetscInt  r, row, nnz_in_row;
529 #else
530   PetscScalar sum[8];
531 #endif
532 
533 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
534   #pragma disjoint(*x, *y, *aval)
535 #endif
536 
537   PetscFunctionBegin;
538   PetscCall(VecGetArrayRead(xx, &x));
539   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
540 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
541   for (i = 0; i < totalslices; i++) { /* loop over slices */
542     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
543     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
544 
545     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
546       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
547       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
548     } else {
549       vec_y = _mm512_loadu_pd(&y[8 * i]);
550     }
551     vec_y2 = _mm512_setzero_pd();
552     vec_y3 = _mm512_setzero_pd();
553     vec_y4 = _mm512_setzero_pd();
554 
555     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
556     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
557     case 3:
558       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
559       acolidx += 8;
560       aval += 8;
561       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
562       acolidx += 8;
563       aval += 8;
564       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
565       acolidx += 8;
566       aval += 8;
567       j += 3;
568       break;
569     case 2:
570       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
571       acolidx += 8;
572       aval += 8;
573       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
574       acolidx += 8;
575       aval += 8;
576       j += 2;
577       break;
578     case 1:
579       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
580       acolidx += 8;
581       aval += 8;
582       j += 1;
583       break;
584     }
585   #pragma novector
586     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
587       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
588       acolidx += 8;
589       aval += 8;
590       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
591       acolidx += 8;
592       aval += 8;
593       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
594       acolidx += 8;
595       aval += 8;
596       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
597       acolidx += 8;
598       aval += 8;
599     }
600 
601     vec_y = _mm512_add_pd(vec_y, vec_y2);
602     vec_y = _mm512_add_pd(vec_y, vec_y3);
603     vec_y = _mm512_add_pd(vec_y, vec_y4);
604     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
605       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
606     } else {
607       _mm512_storeu_pd(&z[8 * i], vec_y);
608     }
609   }
610 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
611   for (i = 0; i < totalslices; i++) { /* loop over full slices */
612     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
613     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
614 
615     /* last slice may have padding rows. Don't use vectorization. */
616     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
617       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
618         row        = 8 * i + r;
619         yval       = (MatScalar)0.0;
620         nnz_in_row = a->rlen[row];
621         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
622         z[row] = y[row] + yval;
623       }
624       break;
625     }
626 
627     vec_y  = _mm256_loadu_pd(y + 8 * i);
628     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
629 
630     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
631     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
632       vec_vals  = _mm256_loadu_pd(aval);
633       vec_x_tmp = _mm_setzero_pd();
634       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
635       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
636       vec_x     = _mm256_setzero_pd();
637       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
638       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
639       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
640       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
641       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
642       aval += 4;
643 
644       vec_vals  = _mm256_loadu_pd(aval);
645       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
646       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
647       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
648       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
649       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
650       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
651       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
652       aval += 4;
653     }
654 
655     _mm256_storeu_pd(z + i * 8, vec_y);
656     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
657   }
658 #else
659   for (i = 0; i < totalslices; i++) { /* loop over slices */
660     for (j = 0; j < 8; j++) sum[j] = 0.0;
661     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
662       sum[0] += aval[j] * x[acolidx[j]];
663       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
664       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
665       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
666       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
667       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
668       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
669       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
670     }
671     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
672       for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
673     } else {
674       for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
675     }
676   }
677 #endif
678 
679   PetscCall(PetscLogFlops(2.0 * a->nz));
680   PetscCall(VecRestoreArrayRead(xx, &x));
681   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
682   PetscFunctionReturn(PETSC_SUCCESS);
683 }
684 
685 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
686 {
687   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
688   PetscScalar       *y;
689   const PetscScalar *x;
690   const MatScalar   *aval    = a->val;
691   const PetscInt    *acolidx = a->colidx;
692   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;
693 
694 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
695   #pragma disjoint(*x, *y, *aval)
696 #endif
697 
698   PetscFunctionBegin;
699   if (A->symmetric == PETSC_BOOL3_TRUE) {
700     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
701     PetscFunctionReturn(PETSC_SUCCESS);
702   }
703   if (zz != yy) PetscCall(VecCopy(zz, yy));
704   PetscCall(VecGetArrayRead(xx, &x));
705   PetscCall(VecGetArray(yy, &y));
706   for (i = 0; i < a->totalslices; i++) { /* loop over slices */
707     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
708       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
709         row        = 8 * i + r;
710         nnz_in_row = a->rlen[row];
711         for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
712       }
713       break;
714     }
715     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
716       y[acolidx[j]] += aval[j] * x[8 * i];
717       y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
718       y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
719       y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
720       y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
721       y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
722       y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
723       y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
724     }
725   }
726   PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices]));
727   PetscCall(VecRestoreArrayRead(xx, &x));
728   PetscCall(VecRestoreArray(yy, &y));
729   PetscFunctionReturn(PETSC_SUCCESS);
730 }
731 
732 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
733 {
734   PetscFunctionBegin;
735   if (A->symmetric == PETSC_BOOL3_TRUE) {
736     PetscCall(MatMult_SeqSELL(A, xx, yy));
737   } else {
738     PetscCall(VecSet(yy, 0.0));
739     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
740   }
741   PetscFunctionReturn(PETSC_SUCCESS);
742 }
743 
744 /*
745      Checks for missing diagonals
746 */
747 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
748 {
749   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
750   PetscInt    *diag, i;
751 
752   PetscFunctionBegin;
753   *missing = PETSC_FALSE;
754   if (A->rmap->n > 0 && !(a->colidx)) {
755     *missing = PETSC_TRUE;
756     if (d) *d = 0;
757     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
758   } else {
759     diag = a->diag;
760     for (i = 0; i < A->rmap->n; i++) {
761       if (diag[i] == -1) {
762         *missing = PETSC_TRUE;
763         if (d) *d = i;
764         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
765         break;
766       }
767     }
768   }
769   PetscFunctionReturn(PETSC_SUCCESS);
770 }
771 
772 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
773 {
774   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
775   PetscInt     i, j, m = A->rmap->n, shift;
776 
777   PetscFunctionBegin;
778   if (!a->diag) {
779     PetscCall(PetscMalloc1(m, &a->diag));
780     a->free_diag = PETSC_TRUE;
781   }
782   for (i = 0; i < m; i++) {                      /* loop over rows */
783     shift      = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
784     a->diag[i] = -1;
785     for (j = 0; j < a->rlen[i]; j++) {
786       if (a->colidx[shift + j * 8] == i) {
787         a->diag[i] = shift + j * 8;
788         break;
789       }
790     }
791   }
792   PetscFunctionReturn(PETSC_SUCCESS);
793 }
794 
795 /*
796   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
797 */
798 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
799 {
800   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
801   PetscInt     i, *diag, m = A->rmap->n;
802   MatScalar   *val = a->val;
803   PetscScalar *idiag, *mdiag;
804 
805   PetscFunctionBegin;
806   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
807   PetscCall(MatMarkDiagonal_SeqSELL(A));
808   diag = a->diag;
809   if (!a->idiag) {
810     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
811     val = a->val;
812   }
813   mdiag = a->mdiag;
814   idiag = a->idiag;
815 
816   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
817     for (i = 0; i < m; i++) {
818       mdiag[i] = val[diag[i]];
819       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
820         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
821         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
822         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
823         A->factorerror_zeropivot_value = 0.0;
824         A->factorerror_zeropivot_row   = i;
825       }
826       idiag[i] = 1.0 / val[diag[i]];
827     }
828     PetscCall(PetscLogFlops(m));
829   } else {
830     for (i = 0; i < m; i++) {
831       mdiag[i] = val[diag[i]];
832       idiag[i] = omega / (fshift + val[diag[i]]);
833     }
834     PetscCall(PetscLogFlops(2.0 * m));
835   }
836   a->idiagvalid = PETSC_TRUE;
837   PetscFunctionReturn(PETSC_SUCCESS);
838 }
839 
840 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
841 {
842   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
843 
844   PetscFunctionBegin;
845   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
846   PetscCall(MatSeqSELLInvalidateDiagonal(A));
847   PetscFunctionReturn(PETSC_SUCCESS);
848 }
849 
850 PetscErrorCode MatDestroy_SeqSELL(Mat A)
851 {
852   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
853 
854   PetscFunctionBegin;
855 #if defined(PETSC_USE_LOG)
856   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
857 #endif
858   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
859   PetscCall(ISDestroy(&a->row));
860   PetscCall(ISDestroy(&a->col));
861   PetscCall(PetscFree(a->diag));
862   PetscCall(PetscFree(a->rlen));
863   PetscCall(PetscFree(a->sliidx));
864   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
865   PetscCall(PetscFree(a->solve_work));
866   PetscCall(ISDestroy(&a->icol));
867   PetscCall(PetscFree(a->saved_values));
868   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
869 
870   PetscCall(PetscFree(A->data));
871 
872   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
873   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
874   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
875   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
876   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
877   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
878   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
879   PetscFunctionReturn(PETSC_SUCCESS);
880 }
881 
882 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
883 {
884   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
885 
886   PetscFunctionBegin;
887   switch (op) {
888   case MAT_ROW_ORIENTED:
889     a->roworiented = flg;
890     break;
891   case MAT_KEEP_NONZERO_PATTERN:
892     a->keepnonzeropattern = flg;
893     break;
894   case MAT_NEW_NONZERO_LOCATIONS:
895     a->nonew = (flg ? 0 : 1);
896     break;
897   case MAT_NEW_NONZERO_LOCATION_ERR:
898     a->nonew = (flg ? -1 : 0);
899     break;
900   case MAT_NEW_NONZERO_ALLOCATION_ERR:
901     a->nonew = (flg ? -2 : 0);
902     break;
903   case MAT_UNUSED_NONZERO_LOCATION_ERR:
904     a->nounused = (flg ? -1 : 0);
905     break;
906   case MAT_FORCE_DIAGONAL_ENTRIES:
907   case MAT_IGNORE_OFF_PROC_ENTRIES:
908   case MAT_USE_HASH_TABLE:
909   case MAT_SORTED_FULL:
910     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
911     break;
912   case MAT_SPD:
913   case MAT_SYMMETRIC:
914   case MAT_STRUCTURALLY_SYMMETRIC:
915   case MAT_HERMITIAN:
916   case MAT_SYMMETRY_ETERNAL:
917   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
918   case MAT_SPD_ETERNAL:
919     /* These options are handled directly by MatSetOption() */
920     break;
921   default:
922     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
923   }
924   PetscFunctionReturn(PETSC_SUCCESS);
925 }
926 
927 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
928 {
929   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
930   PetscInt     i, j, n, shift;
931   PetscScalar *x, zero = 0.0;
932 
933   PetscFunctionBegin;
934   PetscCall(VecGetLocalSize(v, &n));
935   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
936 
937   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
938     PetscInt *diag = a->diag;
939     PetscCall(VecGetArray(v, &x));
940     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
941     PetscCall(VecRestoreArray(v, &x));
942     PetscFunctionReturn(PETSC_SUCCESS);
943   }
944 
945   PetscCall(VecSet(v, zero));
946   PetscCall(VecGetArray(v, &x));
947   for (i = 0; i < n; i++) {                 /* loop over rows */
948     shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
949     x[i]  = 0;
950     for (j = 0; j < a->rlen[i]; j++) {
951       if (a->colidx[shift + j * 8] == i) {
952         x[i] = a->val[shift + j * 8];
953         break;
954       }
955     }
956   }
957   PetscCall(VecRestoreArray(v, &x));
958   PetscFunctionReturn(PETSC_SUCCESS);
959 }
960 
961 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
962 {
963   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
964   const PetscScalar *l, *r;
965   PetscInt           i, j, m, n, row;
966 
967   PetscFunctionBegin;
968   if (ll) {
969     /* The local size is used so that VecMPI can be passed to this routine
970        by MatDiagonalScale_MPISELL */
971     PetscCall(VecGetLocalSize(ll, &m));
972     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
973     PetscCall(VecGetArrayRead(ll, &l));
974     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
975       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
976         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
977           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
978         }
979       } else {
980         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
981       }
982     }
983     PetscCall(VecRestoreArrayRead(ll, &l));
984     PetscCall(PetscLogFlops(a->nz));
985   }
986   if (rr) {
987     PetscCall(VecGetLocalSize(rr, &n));
988     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
989     PetscCall(VecGetArrayRead(rr, &r));
990     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
991       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
992         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
993           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
994         }
995       } else {
996         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
997       }
998     }
999     PetscCall(VecRestoreArrayRead(rr, &r));
1000     PetscCall(PetscLogFlops(a->nz));
1001   }
1002   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1003   PetscFunctionReturn(PETSC_SUCCESS);
1004 }
1005 
1006 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1007 {
1008   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1009   PetscInt    *cp, i, k, low, high, t, row, col, l;
1010   PetscInt     shift;
1011   MatScalar   *vp;
1012 
1013   PetscFunctionBegin;
1014   for (k = 0; k < m; k++) { /* loop over requested rows */
1015     row = im[k];
1016     if (row < 0) continue;
1017     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1018     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1019     cp    = a->colidx + shift;                  /* pointer to the row */
1020     vp    = a->val + shift;                     /* pointer to the row */
1021     for (l = 0; l < n; l++) {                   /* loop over requested columns */
1022       col = in[l];
1023       if (col < 0) continue;
1024       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1025       high = a->rlen[row];
1026       low  = 0; /* assume unsorted */
1027       while (high - low > 5) {
1028         t = (low + high) / 2;
1029         if (*(cp + t * 8) > col) high = t;
1030         else low = t;
1031       }
1032       for (i = low; i < high; i++) {
1033         if (*(cp + 8 * i) > col) break;
1034         if (*(cp + 8 * i) == col) {
1035           *v++ = *(vp + 8 * i);
1036           goto finished;
1037         }
1038       }
1039       *v++ = 0.0;
1040     finished:;
1041     }
1042   }
1043   PetscFunctionReturn(PETSC_SUCCESS);
1044 }
1045 
1046 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1047 {
1048   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1049   PetscInt          i, j, m = A->rmap->n, shift;
1050   const char       *name;
1051   PetscViewerFormat format;
1052 
1053   PetscFunctionBegin;
1054   PetscCall(PetscViewerGetFormat(viewer, &format));
1055   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1056     PetscInt nofinalvalue = 0;
1057     /*
1058     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1059       nofinalvalue = 1;
1060     }
1061     */
1062     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1063     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1064     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1065 #if defined(PETSC_USE_COMPLEX)
1066     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1067 #else
1068     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1069 #endif
1070     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1071 
1072     for (i = 0; i < m; i++) {
1073       shift = a->sliidx[i >> 3] + (i & 0x07);
1074       for (j = 0; j < a->rlen[i]; j++) {
1075 #if defined(PETSC_USE_COMPLEX)
1076         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1077 #else
1078         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]));
1079 #endif
1080       }
1081     }
1082     /*
1083     if (nofinalvalue) {
1084 #if defined(PETSC_USE_COMPLEX)
1085       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1086 #else
1087       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1088 #endif
1089     }
1090     */
1091     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1092     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1093     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1094   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1095     PetscFunctionReturn(PETSC_SUCCESS);
1096   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1097     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1098     for (i = 0; i < m; i++) {
1099       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1100       shift = a->sliidx[i >> 3] + (i & 0x07);
1101       for (j = 0; j < a->rlen[i]; j++) {
1102 #if defined(PETSC_USE_COMPLEX)
1103         if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1104           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1105         } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1106           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1107         } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1108           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1109         }
1110 #else
1111         if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1112 #endif
1113       }
1114       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1115     }
1116     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1117   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1118     PetscInt    cnt = 0, jcnt;
1119     PetscScalar value;
1120 #if defined(PETSC_USE_COMPLEX)
1121     PetscBool realonly = PETSC_TRUE;
1122     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1123       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1124         realonly = PETSC_FALSE;
1125         break;
1126       }
1127     }
1128 #endif
1129 
1130     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1131     for (i = 0; i < m; i++) {
1132       jcnt  = 0;
1133       shift = a->sliidx[i >> 3] + (i & 0x07);
1134       for (j = 0; j < A->cmap->n; j++) {
1135         if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1136           value = a->val[cnt++];
1137           jcnt++;
1138         } else {
1139           value = 0.0;
1140         }
1141 #if defined(PETSC_USE_COMPLEX)
1142         if (realonly) {
1143           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1144         } else {
1145           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1146         }
1147 #else
1148         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1149 #endif
1150       }
1151       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1152     }
1153     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1154   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1155     PetscInt fshift = 1;
1156     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1157 #if defined(PETSC_USE_COMPLEX)
1158     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1159 #else
1160     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1161 #endif
1162     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1163     for (i = 0; i < m; i++) {
1164       shift = a->sliidx[i >> 3] + (i & 0x07);
1165       for (j = 0; j < a->rlen[i]; j++) {
1166 #if defined(PETSC_USE_COMPLEX)
1167         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1168 #else
1169         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]));
1170 #endif
1171       }
1172     }
1173     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1174   } else if (format == PETSC_VIEWER_NATIVE) {
1175     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1176       PetscInt row;
1177       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1178       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1179 #if defined(PETSC_USE_COMPLEX)
1180         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1181           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1182         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1183           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1184         } else {
1185           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1186         }
1187 #else
1188         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]));
1189 #endif
1190       }
1191     }
1192   } else {
1193     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1194     if (A->factortype) {
1195       for (i = 0; i < m; i++) {
1196         shift = a->sliidx[i >> 3] + (i & 0x07);
1197         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1198         /* L part */
1199         for (j = shift; j < a->diag[i]; j += 8) {
1200 #if defined(PETSC_USE_COMPLEX)
1201           if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1202             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1203           } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1204             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1205           } else {
1206             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1207           }
1208 #else
1209           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1210 #endif
1211         }
1212         /* diagonal */
1213         j = a->diag[i];
1214 #if defined(PETSC_USE_COMPLEX)
1215         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1216           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1217         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1218           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1219         } else {
1220           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1221         }
1222 #else
1223         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1224 #endif
1225 
1226         /* U part */
1227         for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1228 #if defined(PETSC_USE_COMPLEX)
1229           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1230             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1231           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1232             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1233           } else {
1234             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1235           }
1236 #else
1237           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1238 #endif
1239         }
1240         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1241       }
1242     } else {
1243       for (i = 0; i < m; i++) {
1244         shift = a->sliidx[i >> 3] + (i & 0x07);
1245         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1246         for (j = 0; j < a->rlen[i]; j++) {
1247 #if defined(PETSC_USE_COMPLEX)
1248           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1249             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1250           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1251             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1252           } else {
1253             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1254           }
1255 #else
1256           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1257 #endif
1258         }
1259         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1260       }
1261     }
1262     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1263   }
1264   PetscCall(PetscViewerFlush(viewer));
1265   PetscFunctionReturn(PETSC_SUCCESS);
1266 }
1267 
1268 #include <petscdraw.h>
1269 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1270 {
1271   Mat               A = (Mat)Aa;
1272   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1273   PetscInt          i, j, m = A->rmap->n, shift;
1274   int               color;
1275   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1276   PetscViewer       viewer;
1277   PetscViewerFormat format;
1278 
1279   PetscFunctionBegin;
1280   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1281   PetscCall(PetscViewerGetFormat(viewer, &format));
1282   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1283 
1284   /* loop over matrix elements drawing boxes */
1285 
1286   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1287     PetscDrawCollectiveBegin(draw);
1288     /* Blue for negative, Cyan for zero and  Red for positive */
1289     color = PETSC_DRAW_BLUE;
1290     for (i = 0; i < m; i++) {
1291       shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1292       y_l   = m - i - 1.0;
1293       y_r   = y_l + 1.0;
1294       for (j = 0; j < a->rlen[i]; j++) {
1295         x_l = a->colidx[shift + j * 8];
1296         x_r = x_l + 1.0;
1297         if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1298         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1299       }
1300     }
1301     color = PETSC_DRAW_CYAN;
1302     for (i = 0; i < m; i++) {
1303       shift = a->sliidx[i >> 3] + (i & 0x07);
1304       y_l   = m - i - 1.0;
1305       y_r   = y_l + 1.0;
1306       for (j = 0; j < a->rlen[i]; j++) {
1307         x_l = a->colidx[shift + j * 8];
1308         x_r = x_l + 1.0;
1309         if (a->val[shift + 8 * j] != 0.) continue;
1310         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1311       }
1312     }
1313     color = PETSC_DRAW_RED;
1314     for (i = 0; i < m; i++) {
1315       shift = a->sliidx[i >> 3] + (i & 0x07);
1316       y_l   = m - i - 1.0;
1317       y_r   = y_l + 1.0;
1318       for (j = 0; j < a->rlen[i]; j++) {
1319         x_l = a->colidx[shift + j * 8];
1320         x_r = x_l + 1.0;
1321         if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1322         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1323       }
1324     }
1325     PetscDrawCollectiveEnd(draw);
1326   } else {
1327     /* use contour shading to indicate magnitude of values */
1328     /* first determine max of all nonzero values */
1329     PetscReal minv = 0.0, maxv = 0.0;
1330     PetscInt  count = 0;
1331     PetscDraw popup;
1332     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1333       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1334     }
1335     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1336     PetscCall(PetscDrawGetPopup(draw, &popup));
1337     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1338 
1339     PetscDrawCollectiveBegin(draw);
1340     for (i = 0; i < m; i++) {
1341       shift = a->sliidx[i >> 3] + (i & 0x07);
1342       y_l   = m - i - 1.0;
1343       y_r   = y_l + 1.0;
1344       for (j = 0; j < a->rlen[i]; j++) {
1345         x_l   = a->colidx[shift + j * 8];
1346         x_r   = x_l + 1.0;
1347         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1348         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1349         count++;
1350       }
1351     }
1352     PetscDrawCollectiveEnd(draw);
1353   }
1354   PetscFunctionReturn(PETSC_SUCCESS);
1355 }
1356 
1357 #include <petscdraw.h>
1358 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1359 {
1360   PetscDraw draw;
1361   PetscReal xr, yr, xl, yl, h, w;
1362   PetscBool isnull;
1363 
1364   PetscFunctionBegin;
1365   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1366   PetscCall(PetscDrawIsNull(draw, &isnull));
1367   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1368 
1369   xr = A->cmap->n;
1370   yr = A->rmap->n;
1371   h  = yr / 10.0;
1372   w  = xr / 10.0;
1373   xr += w;
1374   yr += h;
1375   xl = -w;
1376   yl = -h;
1377   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1378   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1379   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1380   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1381   PetscCall(PetscDrawSave(draw));
1382   PetscFunctionReturn(PETSC_SUCCESS);
1383 }
1384 
1385 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1386 {
1387   PetscBool iascii, isbinary, isdraw;
1388 
1389   PetscFunctionBegin;
1390   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1391   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1392   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1393   if (iascii) {
1394     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1395   } else if (isbinary) {
1396     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1397   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1398   PetscFunctionReturn(PETSC_SUCCESS);
1399 }
1400 
1401 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1402 {
1403   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1404   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1405   MatScalar   *vp;
1406 
1407   PetscFunctionBegin;
1408   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1409   /* To do: compress out the unused elements */
1410   PetscCall(MatMarkDiagonal_SeqSELL(A));
1411   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1412   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1413   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1414   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1415   for (i = 0; i < a->totalslices; ++i) {
1416     shift = a->sliidx[i];                                      /* starting index of the slice */
1417     cp    = a->colidx + shift;                                 /* pointer to the column indices of the slice */
1418     vp    = a->val + shift;                                    /* pointer to the nonzero values of the slice */
1419     for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1420       row  = 8 * i + row_in_slice;
1421       nrow = a->rlen[row]; /* number of nonzeros in row */
1422       /*
1423         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1424         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1425       */
1426       lastcol = 0;
1427       if (nrow > 0) {                                /* nonempty row */
1428         lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1429       } else if (!row_in_slice) {                    /* first row of the currect slice is empty */
1430         for (j = 1; j < 8; j++) {
1431           if (a->rlen[8 * i + j]) {
1432             lastcol = cp[j];
1433             break;
1434           }
1435         }
1436       } else {
1437         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1438       }
1439 
1440       for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1441         cp[8 * k + row_in_slice] = lastcol;
1442         vp[8 * k + row_in_slice] = (MatScalar)0;
1443       }
1444     }
1445   }
1446 
1447   A->info.mallocs += a->reallocs;
1448   a->reallocs = 0;
1449 
1450   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1451   PetscFunctionReturn(PETSC_SUCCESS);
1452 }
1453 
1454 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1455 {
1456   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1457 
1458   PetscFunctionBegin;
1459   info->block_size   = 1.0;
1460   info->nz_allocated = a->maxallocmat;
1461   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1462   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1463   info->assemblies   = A->num_ass;
1464   info->mallocs      = A->info.mallocs;
1465   info->memory       = 0; /* REVIEW ME */
1466   if (A->factortype) {
1467     info->fill_ratio_given  = A->info.fill_ratio_given;
1468     info->fill_ratio_needed = A->info.fill_ratio_needed;
1469     info->factor_mallocs    = A->info.factor_mallocs;
1470   } else {
1471     info->fill_ratio_given  = 0;
1472     info->fill_ratio_needed = 0;
1473     info->factor_mallocs    = 0;
1474   }
1475   PetscFunctionReturn(PETSC_SUCCESS);
1476 }
1477 
1478 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1479 {
1480   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1481   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1482   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1483   MatScalar   *vp, value;
1484 
1485   PetscFunctionBegin;
1486   for (k = 0; k < m; k++) { /* loop over added rows */
1487     row = im[k];
1488     if (row < 0) continue;
1489     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1490     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1491     cp    = a->colidx + shift;                  /* pointer to the row */
1492     vp    = a->val + shift;                     /* pointer to the row */
1493     nrow  = a->rlen[row];
1494     low   = 0;
1495     high  = nrow;
1496 
1497     for (l = 0; l < n; l++) { /* loop over added columns */
1498       col = in[l];
1499       if (col < 0) continue;
1500       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1501       if (a->roworiented) {
1502         value = v[l + k * n];
1503       } else {
1504         value = v[k + l * m];
1505       }
1506       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1507 
1508       /* search in this row for the specified column, i indicates the column to be set */
1509       if (col <= lastcol) low = 0;
1510       else high = nrow;
1511       lastcol = col;
1512       while (high - low > 5) {
1513         t = (low + high) / 2;
1514         if (*(cp + t * 8) > col) high = t;
1515         else low = t;
1516       }
1517       for (i = low; i < high; i++) {
1518         if (*(cp + i * 8) > col) break;
1519         if (*(cp + i * 8) == col) {
1520           if (is == ADD_VALUES) *(vp + i * 8) += value;
1521           else *(vp + i * 8) = value;
1522           low = i + 1;
1523           goto noinsert;
1524         }
1525       }
1526       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1527       if (nonew == 1) goto noinsert;
1528       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1529       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1530       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1531       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1532       for (ii = nrow - 1; ii >= i; ii--) {
1533         *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1534         *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1535       }
1536       a->rlen[row]++;
1537       *(cp + i * 8) = col;
1538       *(vp + i * 8) = value;
1539       a->nz++;
1540       A->nonzerostate++;
1541       low = i + 1;
1542       high++;
1543       nrow++;
1544     noinsert:;
1545     }
1546     a->rlen[row] = nrow;
1547   }
1548   PetscFunctionReturn(PETSC_SUCCESS);
1549 }
1550 
1551 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1552 {
1553   PetscFunctionBegin;
1554   /* If the two matrices have the same copy implementation, use fast copy. */
1555   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1556     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1557     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1558 
1559     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1560     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1561   } else {
1562     PetscCall(MatCopy_Basic(A, B, str));
1563   }
1564   PetscFunctionReturn(PETSC_SUCCESS);
1565 }
1566 
1567 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1568 {
1569   PetscFunctionBegin;
1570   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1571   PetscFunctionReturn(PETSC_SUCCESS);
1572 }
1573 
1574 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1575 {
1576   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1577 
1578   PetscFunctionBegin;
1579   *array = a->val;
1580   PetscFunctionReturn(PETSC_SUCCESS);
1581 }
1582 
1583 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1584 {
1585   PetscFunctionBegin;
1586   PetscFunctionReturn(PETSC_SUCCESS);
1587 }
1588 
1589 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1590 {
1591   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1592   PetscInt     i;
1593   MatScalar   *aval = a->val;
1594 
1595   PetscFunctionBegin;
1596   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1597   PetscFunctionReturn(PETSC_SUCCESS);
1598 }
1599 
1600 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1601 {
1602   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1603   PetscInt     i;
1604   MatScalar   *aval = a->val;
1605 
1606   PetscFunctionBegin;
1607   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1608   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1609   PetscFunctionReturn(PETSC_SUCCESS);
1610 }
1611 
1612 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1613 {
1614   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1615   MatScalar   *aval   = a->val;
1616   PetscScalar  oalpha = alpha;
1617   PetscBLASInt one    = 1, size;
1618 
1619   PetscFunctionBegin;
1620   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1621   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1622   PetscCall(PetscLogFlops(a->nz));
1623   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1624   PetscFunctionReturn(PETSC_SUCCESS);
1625 }
1626 
1627 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1628 {
1629   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1630 
1631   PetscFunctionBegin;
1632   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1633   PetscCall(MatShift_Basic(Y, a));
1634   PetscFunctionReturn(PETSC_SUCCESS);
1635 }
1636 
1637 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1638 {
1639   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1640   PetscScalar       *x, sum, *t;
1641   const MatScalar   *idiag = NULL, *mdiag;
1642   const PetscScalar *b, *xb;
1643   PetscInt           n, m = A->rmap->n, i, j, shift;
1644   const PetscInt    *diag;
1645 
1646   PetscFunctionBegin;
1647   its = its * lits;
1648 
1649   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1650   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1651   a->fshift = fshift;
1652   a->omega  = omega;
1653 
1654   diag  = a->diag;
1655   t     = a->ssor_work;
1656   idiag = a->idiag;
1657   mdiag = a->mdiag;
1658 
1659   PetscCall(VecGetArray(xx, &x));
1660   PetscCall(VecGetArrayRead(bb, &b));
1661   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1662   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1663   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1664   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1665 
1666   if (flag & SOR_ZERO_INITIAL_GUESS) {
1667     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1668       for (i = 0; i < m; i++) {
1669         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1670         sum   = b[i];
1671         n     = (diag[i] - shift) / 8;
1672         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1673         t[i] = sum;
1674         x[i] = sum * idiag[i];
1675       }
1676       xb = t;
1677       PetscCall(PetscLogFlops(a->nz));
1678     } else xb = b;
1679     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1680       for (i = m - 1; i >= 0; i--) {
1681         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1682         sum   = xb[i];
1683         n     = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1684         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1685         if (xb == b) {
1686           x[i] = sum * idiag[i];
1687         } else {
1688           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1689         }
1690       }
1691       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1692     }
1693     its--;
1694   }
1695   while (its--) {
1696     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1697       for (i = 0; i < m; i++) {
1698         /* lower */
1699         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1700         sum   = b[i];
1701         n     = (diag[i] - shift) / 8;
1702         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1703         t[i] = sum; /* save application of the lower-triangular part */
1704         /* upper */
1705         n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1706         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1707         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1708       }
1709       xb = t;
1710       PetscCall(PetscLogFlops(2.0 * a->nz));
1711     } else xb = b;
1712     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1713       for (i = m - 1; i >= 0; i--) {
1714         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1715         sum   = xb[i];
1716         if (xb == b) {
1717           /* whole matrix (no checkpointing available) */
1718           n = a->rlen[i];
1719           for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1720           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1721         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1722           n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1723           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1724           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1725         }
1726       }
1727       if (xb == b) {
1728         PetscCall(PetscLogFlops(2.0 * a->nz));
1729       } else {
1730         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1731       }
1732     }
1733   }
1734   PetscCall(VecRestoreArray(xx, &x));
1735   PetscCall(VecRestoreArrayRead(bb, &b));
1736   PetscFunctionReturn(PETSC_SUCCESS);
1737 }
1738 
1739 /* -------------------------------------------------------------------*/
1740 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1741                                        MatGetRow_SeqSELL,
1742                                        MatRestoreRow_SeqSELL,
1743                                        MatMult_SeqSELL,
1744                                        /* 4*/ MatMultAdd_SeqSELL,
1745                                        MatMultTranspose_SeqSELL,
1746                                        MatMultTransposeAdd_SeqSELL,
1747                                        NULL,
1748                                        NULL,
1749                                        NULL,
1750                                        /* 10*/ NULL,
1751                                        NULL,
1752                                        NULL,
1753                                        MatSOR_SeqSELL,
1754                                        NULL,
1755                                        /* 15*/ MatGetInfo_SeqSELL,
1756                                        MatEqual_SeqSELL,
1757                                        MatGetDiagonal_SeqSELL,
1758                                        MatDiagonalScale_SeqSELL,
1759                                        NULL,
1760                                        /* 20*/ NULL,
1761                                        MatAssemblyEnd_SeqSELL,
1762                                        MatSetOption_SeqSELL,
1763                                        MatZeroEntries_SeqSELL,
1764                                        /* 24*/ NULL,
1765                                        NULL,
1766                                        NULL,
1767                                        NULL,
1768                                        NULL,
1769                                        /* 29*/ MatSetUp_SeqSELL,
1770                                        NULL,
1771                                        NULL,
1772                                        NULL,
1773                                        NULL,
1774                                        /* 34*/ MatDuplicate_SeqSELL,
1775                                        NULL,
1776                                        NULL,
1777                                        NULL,
1778                                        NULL,
1779                                        /* 39*/ NULL,
1780                                        NULL,
1781                                        NULL,
1782                                        MatGetValues_SeqSELL,
1783                                        MatCopy_SeqSELL,
1784                                        /* 44*/ NULL,
1785                                        MatScale_SeqSELL,
1786                                        MatShift_SeqSELL,
1787                                        NULL,
1788                                        NULL,
1789                                        /* 49*/ NULL,
1790                                        NULL,
1791                                        NULL,
1792                                        NULL,
1793                                        NULL,
1794                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1795                                        NULL,
1796                                        NULL,
1797                                        NULL,
1798                                        NULL,
1799                                        /* 59*/ NULL,
1800                                        MatDestroy_SeqSELL,
1801                                        MatView_SeqSELL,
1802                                        NULL,
1803                                        NULL,
1804                                        /* 64*/ NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        NULL,
1809                                        /* 69*/ NULL,
1810                                        NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        NULL,
1814                                        /* 74*/ NULL,
1815                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        /* 79*/ NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        /* 84*/ NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        NULL,
1829                                        /* 89*/ NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        NULL,
1834                                        /* 94*/ NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        NULL,
1839                                        /* 99*/ NULL,
1840                                        NULL,
1841                                        NULL,
1842                                        MatConjugate_SeqSELL,
1843                                        NULL,
1844                                        /*104*/ NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        /*109*/ NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        MatMissingDiagonal_SeqSELL,
1854                                        /*114*/ NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        /*119*/ NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        NULL,
1864                                        /*124*/ NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        NULL,
1869                                        /*129*/ NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        /*134*/ NULL,
1875                                        NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        NULL,
1879                                        /*139*/ NULL,
1880                                        NULL,
1881                                        NULL,
1882                                        MatFDColoringSetUp_SeqXAIJ,
1883                                        NULL,
1884                                        /*144*/ NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        NULL,
1890                                        /*150*/ NULL,
1891                                        NULL};
1892 
1893 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1894 {
1895   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1896 
1897   PetscFunctionBegin;
1898   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1899 
1900   /* allocate space for values if not already there */
1901   if (!a->saved_values) { PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); }
1902 
1903   /* copy values over */
1904   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1905   PetscFunctionReturn(PETSC_SUCCESS);
1906 }
1907 
1908 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1909 {
1910   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1911 
1912   PetscFunctionBegin;
1913   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1914   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1915   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1916   PetscFunctionReturn(PETSC_SUCCESS);
1917 }
1918 
1919 /*@C
1920  MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
1921 
1922  Not Collective
1923 
1924  Input Parameters:
1925  .  mat - a `MATSEQSELL` matrix
1926  .  array - pointer to the data
1927 
1928  Level: intermediate
1929 
1930  .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1931  @*/
1932 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
1933 {
1934   PetscFunctionBegin;
1935   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1936   PetscFunctionReturn(PETSC_SUCCESS);
1937 }
1938 
1939 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1940 {
1941   Mat_SeqSELL *b;
1942   PetscMPIInt  size;
1943 
1944   PetscFunctionBegin;
1945   PetscCall(PetscCitationsRegister(citation, &cited));
1946   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
1947   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
1948 
1949   PetscCall(PetscNew(&b));
1950 
1951   B->data = (void *)b;
1952 
1953   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
1954 
1955   b->row                = NULL;
1956   b->col                = NULL;
1957   b->icol               = NULL;
1958   b->reallocs           = 0;
1959   b->ignorezeroentries  = PETSC_FALSE;
1960   b->roworiented        = PETSC_TRUE;
1961   b->nonew              = 0;
1962   b->diag               = NULL;
1963   b->solve_work         = NULL;
1964   B->spptr              = NULL;
1965   b->saved_values       = NULL;
1966   b->idiag              = NULL;
1967   b->mdiag              = NULL;
1968   b->ssor_work          = NULL;
1969   b->omega              = 1.0;
1970   b->fshift             = 0.0;
1971   b->idiagvalid         = PETSC_FALSE;
1972   b->keepnonzeropattern = PETSC_FALSE;
1973 
1974   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
1975   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
1976   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
1977   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
1978   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
1979   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
1980   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
1981   PetscFunctionReturn(PETSC_SUCCESS);
1982 }
1983 
1984 /*
1985  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1986  */
1987 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
1988 {
1989   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1990   PetscInt     i, m                           = A->rmap->n;
1991   PetscInt     totalslices = a->totalslices;
1992 
1993   PetscFunctionBegin;
1994   C->factortype = A->factortype;
1995   c->row        = NULL;
1996   c->col        = NULL;
1997   c->icol       = NULL;
1998   c->reallocs   = 0;
1999   C->assembled  = PETSC_TRUE;
2000 
2001   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2002   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2003 
2004   PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
2005   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2006 
2007   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2008   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2009 
2010   /* allocate the matrix space */
2011   if (mallocmatspace) {
2012     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2013 
2014     c->singlemalloc = PETSC_TRUE;
2015 
2016     if (m > 0) {
2017       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2018       if (cpvalues == MAT_COPY_VALUES) {
2019         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2020       } else {
2021         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2022       }
2023     }
2024   }
2025 
2026   c->ignorezeroentries = a->ignorezeroentries;
2027   c->roworiented       = a->roworiented;
2028   c->nonew             = a->nonew;
2029   if (a->diag) {
2030     PetscCall(PetscMalloc1(m, &c->diag));
2031     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2032   } else c->diag = NULL;
2033 
2034   c->solve_work         = NULL;
2035   c->saved_values       = NULL;
2036   c->idiag              = NULL;
2037   c->ssor_work          = NULL;
2038   c->keepnonzeropattern = a->keepnonzeropattern;
2039   c->free_val           = PETSC_TRUE;
2040   c->free_colidx        = PETSC_TRUE;
2041 
2042   c->maxallocmat  = a->maxallocmat;
2043   c->maxallocrow  = a->maxallocrow;
2044   c->rlenmax      = a->rlenmax;
2045   c->nz           = a->nz;
2046   C->preallocated = PETSC_TRUE;
2047 
2048   c->nonzerorowcnt = a->nonzerorowcnt;
2049   C->nonzerostate  = A->nonzerostate;
2050 
2051   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2052   PetscFunctionReturn(PETSC_SUCCESS);
2053 }
2054 
2055 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2056 {
2057   PetscFunctionBegin;
2058   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2059   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2060   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2061   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2062   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2063   PetscFunctionReturn(PETSC_SUCCESS);
2064 }
2065 
2066 /*MC
2067    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2068    based on the sliced Ellpack format
2069 
2070    Options Database Keys:
2071 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2072 
2073    Level: beginner
2074 
2075 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2076 M*/
2077 
2078 /*MC
2079    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2080 
2081    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2082    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2083   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2084   for communicators controlling multiple processes.  It is recommended that you call both of
2085   the above preallocation routines for simplicity.
2086 
2087    Options Database Keys:
2088 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2089 
2090   Level: beginner
2091 
2092   Notes:
2093    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2094 
2095    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2096    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2097 
2098   Developer Notes:
2099    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2100 
2101    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2102 .vb
2103                             (2 0  3 4)
2104    Consider the matrix A =  (5 0  6 0)
2105                             (0 0  7 8)
2106                             (0 0  9 9)
2107 
2108    symbolically the Ellpack format can be written as
2109 
2110         (2 3 4 |)           (0 2 3 |)
2111    v =  (5 6 0 |)  colidx = (0 2 2 |)
2112         --------            ---------
2113         (7 8 |)             (2 3 |)
2114         (9 9 |)             (2 3 |)
2115 
2116     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2117     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2118     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2119 
2120     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2121 
2122 .ve
2123 
2124       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2125 
2126  References:
2127 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2128    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2129 
2130 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2131 M*/
2132 
2133 /*@C
2134        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2135 
2136  Collective on comm
2137 
2138  Input Parameters:
2139 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2140 .  m - number of rows
2141 .  n - number of columns
2142 .  rlenmax - maximum number of nonzeros in a row
2143 -  rlen - array containing the number of nonzeros in the various rows
2144  (possibly different for each row) or NULL
2145 
2146  Output Parameter:
2147 .  A - the matrix
2148 
2149  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2150  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2151  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2152 
2153  Notes:
2154  If nnz is given then nz is ignored
2155 
2156  Specify the preallocated storage with either rlenmax or rlen (not both).
2157  Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory
2158  allocation.  For large problems you MUST preallocate memory or you
2159  will get TERRIBLE performance, see the users' manual chapter on matrices.
2160 
2161  Level: intermediate
2162 
2163  .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2164  @*/
2165 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A)
2166 {
2167   PetscFunctionBegin;
2168   PetscCall(MatCreate(comm, A));
2169   PetscCall(MatSetSizes(*A, m, n, m, n));
2170   PetscCall(MatSetType(*A, MATSEQSELL));
2171   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen));
2172   PetscFunctionReturn(PETSC_SUCCESS);
2173 }
2174 
2175 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2176 {
2177   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2178   PetscInt     totalslices = a->totalslices;
2179 
2180   PetscFunctionBegin;
2181   /* If the  matrix dimensions are not equal,or no of nonzeros */
2182   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2183     *flg = PETSC_FALSE;
2184     PetscFunctionReturn(PETSC_SUCCESS);
2185   }
2186   /* if the a->colidx are the same */
2187   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2188   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2189   /* if a->val are the same */
2190   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2191   PetscFunctionReturn(PETSC_SUCCESS);
2192 }
2193 
2194 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2195 {
2196   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2197 
2198   PetscFunctionBegin;
2199   a->idiagvalid = PETSC_FALSE;
2200   PetscFunctionReturn(PETSC_SUCCESS);
2201 }
2202 
2203 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2204 {
2205 #if defined(PETSC_USE_COMPLEX)
2206   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2207   PetscInt     i;
2208   PetscScalar *val = a->val;
2209 
2210   PetscFunctionBegin;
2211   for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2212 #else
2213   PetscFunctionBegin;
2214 #endif
2215   PetscFunctionReturn(PETSC_SUCCESS);
2216 }
2217