xref: /petsc/src/mat/impls/sell/seq/sell.c (revision d8e4f26e7eadd05aa3d9f3d0f2220b3e914971d9)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter nz
48  (or the array nnz).  By setting these parameters accurately, performance
49  during matrix assembly can be increased significantly.
50 
51  Collective
52 
53  Input Parameters:
54  +  B - The `MATSEQSELL` matrix
55  .  rlenmax - number of nonzeros per row (same for all rows)
56  -  rlen - array containing the number of nonzeros in the various rows
57  (possibly different for each row) or `NULL`
58 
59  Level: intermediate
60 
61  Notes:
62  If `rlen` is given then `rlenmax` is ignored.
63 
64  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
65  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
66  allocation.  For large problems you MUST preallocate memory or you
67  will get TERRIBLE performance, see the users' manual chapter on matrices.
68 
69  You can call `MatGetInfo()` to get information on how effective the preallocation was;
70  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
71  You can also run with the option `-info` and look for messages with the string
72  malloc in them to see if additional memory allocation was needed.
73 
74  Developers Note:
75  Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
76  entries or columns indices.
77 
78  The maximum number of nonzeos in any row should be as accurate as possible.
79  If it is underestimated, you will get bad performance due to reallocation
80  (`MatSeqXSELLReallocateSELL()`).
81 
82  .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
83  @*/
84 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
85 {
86   PetscFunctionBegin;
87   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
88   PetscValidType(B, 1);
89   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
90   PetscFunctionReturn(0);
91 }
92 
93 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
94 {
95   Mat_SeqSELL *b;
96   PetscInt     i, j, totalslices;
97   PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
98 
99   PetscFunctionBegin;
100   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
101   if (maxallocrow == MAT_SKIP_ALLOCATION) {
102     skipallocation = PETSC_TRUE;
103     maxallocrow    = 0;
104   }
105 
106   PetscCall(PetscLayoutSetUp(B->rmap));
107   PetscCall(PetscLayoutSetUp(B->cmap));
108 
109   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
110   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
111   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
112   if (rlen) {
113     for (i = 0; i < B->rmap->n; i++) {
114       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
115       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
116     }
117   }
118 
119   B->preallocated = PETSC_TRUE;
120 
121   b = (Mat_SeqSELL *)B->data;
122 
123   totalslices    = PetscCeilInt(B->rmap->n, 8);
124   b->totalslices = totalslices;
125   if (!skipallocation) {
126     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));
127 
128     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
129       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
130     }
131     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
132       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
133       else if (maxallocrow < 0) maxallocrow = 1;
134       for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
135     } else {
136       maxallocrow  = 0;
137       b->sliidx[0] = 0;
138       for (i = 1; i < totalslices; i++) {
139         b->sliidx[i] = 0;
140         for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
141         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
142         PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
143       }
144       /* last slice */
145       b->sliidx[totalslices] = 0;
146       for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
147       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
148       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
149     }
150 
151     /* allocate space for val, colidx, rlen */
152     /* FIXME: should B's old memory be unlogged? */
153     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
154     /* FIXME: assuming an element of the bit array takes 8 bits */
155     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
156     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
157     PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
158 
159     b->singlemalloc = PETSC_TRUE;
160     b->free_val     = PETSC_TRUE;
161     b->free_colidx  = PETSC_TRUE;
162   } else {
163     b->free_val    = PETSC_FALSE;
164     b->free_colidx = PETSC_FALSE;
165   }
166 
167   b->nz               = 0;
168   b->maxallocrow      = maxallocrow;
169   b->rlenmax          = maxallocrow;
170   b->maxallocmat      = b->sliidx[totalslices];
171   B->info.nz_unneeded = (double)b->maxallocmat;
172   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
173   PetscFunctionReturn(0);
174 }
175 
176 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
177 {
178   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
179   PetscInt     shift;
180 
181   PetscFunctionBegin;
182   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
183   if (nz) *nz = a->rlen[row];
184   shift = a->sliidx[row >> 3] + (row & 0x07);
185   if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
186   if (idx) {
187     PetscInt j;
188     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
189     *idx = a->getrowcols;
190   }
191   if (v) {
192     PetscInt j;
193     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
194     *v = a->getrowvals;
195   }
196   PetscFunctionReturn(0);
197 }
198 
199 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
200 {
201   PetscFunctionBegin;
202   PetscFunctionReturn(0);
203 }
204 
205 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
206 {
207   Mat          B;
208   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
209   PetscInt     i;
210 
211   PetscFunctionBegin;
212   if (reuse == MAT_REUSE_MATRIX) {
213     B = *newmat;
214     PetscCall(MatZeroEntries(B));
215   } else {
216     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
217     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
218     PetscCall(MatSetType(B, MATSEQAIJ));
219     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
220   }
221 
222   for (i = 0; i < A->rmap->n; i++) {
223     PetscInt     nz = 0, *cols = NULL;
224     PetscScalar *vals = NULL;
225 
226     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
227     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
228     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
229   }
230 
231   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
232   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
233   B->rmap->bs = A->rmap->bs;
234 
235   if (reuse == MAT_INPLACE_MATRIX) {
236     PetscCall(MatHeaderReplace(A, &B));
237   } else {
238     *newmat = B;
239   }
240   PetscFunctionReturn(0);
241 }
242 
243 #include <../src/mat/impls/aij/seq/aij.h>
244 
245 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
246 {
247   Mat                B;
248   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
249   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
250   const PetscInt    *cols;
251   const PetscScalar *vals;
252 
253   PetscFunctionBegin;
254 
255   if (reuse == MAT_REUSE_MATRIX) {
256     B = *newmat;
257   } else {
258     if (PetscDefined(USE_DEBUG) || !a->ilen) {
259       PetscCall(PetscMalloc1(m, &rowlengths));
260       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
261     }
262     if (PetscDefined(USE_DEBUG) && a->ilen) {
263       PetscBool eq;
264       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
265       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
266       PetscCall(PetscFree(rowlengths));
267       rowlengths = a->ilen;
268     } else if (a->ilen) rowlengths = a->ilen;
269     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
270     PetscCall(MatSetSizes(B, m, n, m, n));
271     PetscCall(MatSetType(B, MATSEQSELL));
272     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
273     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
274   }
275 
276   for (row = 0; row < m; row++) {
277     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
278     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
279     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
280   }
281   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
282   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
283   B->rmap->bs = A->rmap->bs;
284 
285   if (reuse == MAT_INPLACE_MATRIX) {
286     PetscCall(MatHeaderReplace(A, &B));
287   } else {
288     *newmat = B;
289   }
290   PetscFunctionReturn(0);
291 }
292 
293 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
294 {
295   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
296   PetscScalar       *y;
297   const PetscScalar *x;
298   const MatScalar   *aval        = a->val;
299   PetscInt           totalslices = a->totalslices;
300   const PetscInt    *acolidx     = a->colidx;
301   PetscInt           i, j;
302 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
303   __m512d  vec_x, vec_y, vec_vals;
304   __m256i  vec_idx;
305   __mmask8 mask;
306   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
307   __m256i  vec_idx2, vec_idx3, vec_idx4;
308 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
309   __m128i   vec_idx;
310   __m256d   vec_x, vec_y, vec_y2, vec_vals;
311   MatScalar yval;
312   PetscInt  r, rows_left, row, nnz_in_row;
313 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
314   __m128d   vec_x_tmp;
315   __m256d   vec_x, vec_y, vec_y2, vec_vals;
316   MatScalar yval;
317   PetscInt  r, rows_left, row, nnz_in_row;
318 #else
319   PetscScalar sum[8];
320 #endif
321 
322 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
323   #pragma disjoint(*x, *y, *aval)
324 #endif
325 
326   PetscFunctionBegin;
327   PetscCall(VecGetArrayRead(xx, &x));
328   PetscCall(VecGetArray(yy, &y));
329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
330   for (i = 0; i < totalslices; i++) { /* loop over slices */
331     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
332     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
333 
334     vec_y  = _mm512_setzero_pd();
335     vec_y2 = _mm512_setzero_pd();
336     vec_y3 = _mm512_setzero_pd();
337     vec_y4 = _mm512_setzero_pd();
338 
339     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
340     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
341     case 3:
342       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
343       acolidx += 8;
344       aval += 8;
345       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
346       acolidx += 8;
347       aval += 8;
348       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
349       acolidx += 8;
350       aval += 8;
351       j += 3;
352       break;
353     case 2:
354       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
355       acolidx += 8;
356       aval += 8;
357       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
358       acolidx += 8;
359       aval += 8;
360       j += 2;
361       break;
362     case 1:
363       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
364       acolidx += 8;
365       aval += 8;
366       j += 1;
367       break;
368     }
369   #pragma novector
370     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
371       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
372       acolidx += 8;
373       aval += 8;
374       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
375       acolidx += 8;
376       aval += 8;
377       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
378       acolidx += 8;
379       aval += 8;
380       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
381       acolidx += 8;
382       aval += 8;
383     }
384 
385     vec_y = _mm512_add_pd(vec_y, vec_y2);
386     vec_y = _mm512_add_pd(vec_y, vec_y3);
387     vec_y = _mm512_add_pd(vec_y, vec_y4);
388     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
389       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
390       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
391     } else {
392       _mm512_storeu_pd(&y[8 * i], vec_y);
393     }
394   }
395 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
396   for (i = 0; i < totalslices; i++) { /* loop over full slices */
397     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
398     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
399 
400     /* last slice may have padding rows. Don't use vectorization. */
401     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
402       rows_left = A->rmap->n - 8 * i;
403       for (r = 0; r < rows_left; ++r) {
404         yval       = (MatScalar)0;
405         row        = 8 * i + r;
406         nnz_in_row = a->rlen[row];
407         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
408         y[row] = yval;
409       }
410       break;
411     }
412 
413     vec_y  = _mm256_setzero_pd();
414     vec_y2 = _mm256_setzero_pd();
415 
416   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
417   #pragma novector
418   #pragma unroll(2)
419     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
420       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
421       aval += 4;
422       acolidx += 4;
423       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
424       aval += 4;
425       acolidx += 4;
426     }
427 
428     _mm256_storeu_pd(y + i * 8, vec_y);
429     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
430   }
431 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
432   for (i = 0; i < totalslices; i++) { /* loop over full slices */
433     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
434     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
435 
436     vec_y  = _mm256_setzero_pd();
437     vec_y2 = _mm256_setzero_pd();
438 
439     /* last slice may have padding rows. Don't use vectorization. */
440     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
441       rows_left = A->rmap->n - 8 * i;
442       for (r = 0; r < rows_left; ++r) {
443         yval       = (MatScalar)0;
444         row        = 8 * i + r;
445         nnz_in_row = a->rlen[row];
446         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
447         y[row] = yval;
448       }
449       break;
450     }
451 
452   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
453   #pragma novector
454   #pragma unroll(2)
455     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
456       vec_vals  = _mm256_loadu_pd(aval);
457       vec_x_tmp = _mm_setzero_pd();
458       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
459       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
460       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
464       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
465       aval += 4;
466 
467       vec_vals  = _mm256_loadu_pd(aval);
468       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
469       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
470       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
471       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
472       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
473       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
474       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
475       aval += 4;
476     }
477 
478     _mm256_storeu_pd(y + i * 8, vec_y);
479     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
480   }
481 #else
482   for (i = 0; i < totalslices; i++) { /* loop over slices */
483     for (j = 0; j < 8; j++) sum[j] = 0.0;
484     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
485       sum[0] += aval[j] * x[acolidx[j]];
486       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
487       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
488       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
489       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
490       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
491       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
492       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
493     }
494     if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
495       for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
496     } else {
497       for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
498     }
499   }
500 #endif
501 
502   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
503   PetscCall(VecRestoreArrayRead(xx, &x));
504   PetscCall(VecRestoreArray(yy, &y));
505   PetscFunctionReturn(0);
506 }
507 
508 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
509 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
510 {
511   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
512   PetscScalar       *y, *z;
513   const PetscScalar *x;
514   const MatScalar   *aval        = a->val;
515   PetscInt           totalslices = a->totalslices;
516   const PetscInt    *acolidx     = a->colidx;
517   PetscInt           i, j;
518 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
519   __m512d  vec_x, vec_y, vec_vals;
520   __m256i  vec_idx;
521   __mmask8 mask;
522   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
523   __m256i  vec_idx2, vec_idx3, vec_idx4;
524 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
525   __m128d   vec_x_tmp;
526   __m256d   vec_x, vec_y, vec_y2, vec_vals;
527   MatScalar yval;
528   PetscInt  r, row, nnz_in_row;
529 #else
530   PetscScalar sum[8];
531 #endif
532 
533 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
534   #pragma disjoint(*x, *y, *aval)
535 #endif
536 
537   PetscFunctionBegin;
538   PetscCall(VecGetArrayRead(xx, &x));
539   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
540 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
541   for (i = 0; i < totalslices; i++) { /* loop over slices */
542     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
543     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
544 
545     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
546       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
547       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
548     } else {
549       vec_y = _mm512_loadu_pd(&y[8 * i]);
550     }
551     vec_y2 = _mm512_setzero_pd();
552     vec_y3 = _mm512_setzero_pd();
553     vec_y4 = _mm512_setzero_pd();
554 
555     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
556     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
557     case 3:
558       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
559       acolidx += 8;
560       aval += 8;
561       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
562       acolidx += 8;
563       aval += 8;
564       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
565       acolidx += 8;
566       aval += 8;
567       j += 3;
568       break;
569     case 2:
570       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
571       acolidx += 8;
572       aval += 8;
573       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
574       acolidx += 8;
575       aval += 8;
576       j += 2;
577       break;
578     case 1:
579       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
580       acolidx += 8;
581       aval += 8;
582       j += 1;
583       break;
584     }
585   #pragma novector
586     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
587       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
588       acolidx += 8;
589       aval += 8;
590       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
591       acolidx += 8;
592       aval += 8;
593       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
594       acolidx += 8;
595       aval += 8;
596       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
597       acolidx += 8;
598       aval += 8;
599     }
600 
601     vec_y = _mm512_add_pd(vec_y, vec_y2);
602     vec_y = _mm512_add_pd(vec_y, vec_y3);
603     vec_y = _mm512_add_pd(vec_y, vec_y4);
604     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
605       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
606     } else {
607       _mm512_storeu_pd(&z[8 * i], vec_y);
608     }
609   }
610 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
611   for (i = 0; i < totalslices; i++) { /* loop over full slices */
612     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
613     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
614 
615     /* last slice may have padding rows. Don't use vectorization. */
616     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
617       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
618         row        = 8 * i + r;
619         yval       = (MatScalar)0.0;
620         nnz_in_row = a->rlen[row];
621         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
622         z[row] = y[row] + yval;
623       }
624       break;
625     }
626 
627     vec_y  = _mm256_loadu_pd(y + 8 * i);
628     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
629 
630     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
631     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
632       vec_vals  = _mm256_loadu_pd(aval);
633       vec_x_tmp = _mm_setzero_pd();
634       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
635       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
636       vec_x     = _mm256_setzero_pd();
637       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
638       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
639       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
640       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
641       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
642       aval += 4;
643 
644       vec_vals  = _mm256_loadu_pd(aval);
645       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
646       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
647       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
648       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
649       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
650       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
651       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
652       aval += 4;
653     }
654 
655     _mm256_storeu_pd(z + i * 8, vec_y);
656     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
657   }
658 #else
659   for (i = 0; i < totalslices; i++) { /* loop over slices */
660     for (j = 0; j < 8; j++) sum[j] = 0.0;
661     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
662       sum[0] += aval[j] * x[acolidx[j]];
663       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
664       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
665       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
666       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
667       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
668       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
669       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
670     }
671     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
672       for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
673     } else {
674       for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
675     }
676   }
677 #endif
678 
679   PetscCall(PetscLogFlops(2.0 * a->nz));
680   PetscCall(VecRestoreArrayRead(xx, &x));
681   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
682   PetscFunctionReturn(0);
683 }
684 
685 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
686 {
687   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
688   PetscScalar       *y;
689   const PetscScalar *x;
690   const MatScalar   *aval    = a->val;
691   const PetscInt    *acolidx = a->colidx;
692   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;
693 
694 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
695   #pragma disjoint(*x, *y, *aval)
696 #endif
697 
698   PetscFunctionBegin;
699   if (A->symmetric == PETSC_BOOL3_TRUE) {
700     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
701     PetscFunctionReturn(0);
702   }
703   if (zz != yy) PetscCall(VecCopy(zz, yy));
704   PetscCall(VecGetArrayRead(xx, &x));
705   PetscCall(VecGetArray(yy, &y));
706   for (i = 0; i < a->totalslices; i++) { /* loop over slices */
707     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
708       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
709         row        = 8 * i + r;
710         nnz_in_row = a->rlen[row];
711         for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
712       }
713       break;
714     }
715     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
716       y[acolidx[j]] += aval[j] * x[8 * i];
717       y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
718       y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
719       y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
720       y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
721       y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
722       y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
723       y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
724     }
725   }
726   PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices]));
727   PetscCall(VecRestoreArrayRead(xx, &x));
728   PetscCall(VecRestoreArray(yy, &y));
729   PetscFunctionReturn(0);
730 }
731 
732 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
733 {
734   PetscFunctionBegin;
735   if (A->symmetric == PETSC_BOOL3_TRUE) {
736     PetscCall(MatMult_SeqSELL(A, xx, yy));
737   } else {
738     PetscCall(VecSet(yy, 0.0));
739     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
740   }
741   PetscFunctionReturn(0);
742 }
743 
744 /*
745      Checks for missing diagonals
746 */
747 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
748 {
749   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
750   PetscInt    *diag, i;
751 
752   PetscFunctionBegin;
753   *missing = PETSC_FALSE;
754   if (A->rmap->n > 0 && !(a->colidx)) {
755     *missing = PETSC_TRUE;
756     if (d) *d = 0;
757     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
758   } else {
759     diag = a->diag;
760     for (i = 0; i < A->rmap->n; i++) {
761       if (diag[i] == -1) {
762         *missing = PETSC_TRUE;
763         if (d) *d = i;
764         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
765         break;
766       }
767     }
768   }
769   PetscFunctionReturn(0);
770 }
771 
772 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
773 {
774   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
775   PetscInt     i, j, m = A->rmap->n, shift;
776 
777   PetscFunctionBegin;
778   if (!a->diag) {
779     PetscCall(PetscMalloc1(m, &a->diag));
780     a->free_diag = PETSC_TRUE;
781   }
782   for (i = 0; i < m; i++) {                      /* loop over rows */
783     shift      = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
784     a->diag[i] = -1;
785     for (j = 0; j < a->rlen[i]; j++) {
786       if (a->colidx[shift + j * 8] == i) {
787         a->diag[i] = shift + j * 8;
788         break;
789       }
790     }
791   }
792   PetscFunctionReturn(0);
793 }
794 
795 /*
796   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
797 */
798 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
799 {
800   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
801   PetscInt     i, *diag, m = A->rmap->n;
802   MatScalar   *val = a->val;
803   PetscScalar *idiag, *mdiag;
804 
805   PetscFunctionBegin;
806   if (a->idiagvalid) PetscFunctionReturn(0);
807   PetscCall(MatMarkDiagonal_SeqSELL(A));
808   diag = a->diag;
809   if (!a->idiag) {
810     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
811     val = a->val;
812   }
813   mdiag = a->mdiag;
814   idiag = a->idiag;
815 
816   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
817     for (i = 0; i < m; i++) {
818       mdiag[i] = val[diag[i]];
819       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
820         if (PetscRealPart(fshift)) {
821           PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
822           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
823           A->factorerror_zeropivot_value = 0.0;
824           A->factorerror_zeropivot_row   = i;
825         } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
826       }
827       idiag[i] = 1.0 / val[diag[i]];
828     }
829     PetscCall(PetscLogFlops(m));
830   } else {
831     for (i = 0; i < m; i++) {
832       mdiag[i] = val[diag[i]];
833       idiag[i] = omega / (fshift + val[diag[i]]);
834     }
835     PetscCall(PetscLogFlops(2.0 * m));
836   }
837   a->idiagvalid = PETSC_TRUE;
838   PetscFunctionReturn(0);
839 }
840 
841 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
842 {
843   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
844 
845   PetscFunctionBegin;
846   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
847   PetscCall(MatSeqSELLInvalidateDiagonal(A));
848   PetscFunctionReturn(0);
849 }
850 
851 PetscErrorCode MatDestroy_SeqSELL(Mat A)
852 {
853   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
854 
855   PetscFunctionBegin;
856 #if defined(PETSC_USE_LOG)
857   PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
858 #endif
859   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
860   PetscCall(ISDestroy(&a->row));
861   PetscCall(ISDestroy(&a->col));
862   PetscCall(PetscFree(a->diag));
863   PetscCall(PetscFree(a->rlen));
864   PetscCall(PetscFree(a->sliidx));
865   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
866   PetscCall(PetscFree(a->solve_work));
867   PetscCall(ISDestroy(&a->icol));
868   PetscCall(PetscFree(a->saved_values));
869   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
870 
871   PetscCall(PetscFree(A->data));
872 
873   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
874   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
875   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
876   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
877   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
878   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
879   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
880   PetscFunctionReturn(0);
881 }
882 
883 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
884 {
885   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
886 
887   PetscFunctionBegin;
888   switch (op) {
889   case MAT_ROW_ORIENTED:
890     a->roworiented = flg;
891     break;
892   case MAT_KEEP_NONZERO_PATTERN:
893     a->keepnonzeropattern = flg;
894     break;
895   case MAT_NEW_NONZERO_LOCATIONS:
896     a->nonew = (flg ? 0 : 1);
897     break;
898   case MAT_NEW_NONZERO_LOCATION_ERR:
899     a->nonew = (flg ? -1 : 0);
900     break;
901   case MAT_NEW_NONZERO_ALLOCATION_ERR:
902     a->nonew = (flg ? -2 : 0);
903     break;
904   case MAT_UNUSED_NONZERO_LOCATION_ERR:
905     a->nounused = (flg ? -1 : 0);
906     break;
907   case MAT_FORCE_DIAGONAL_ENTRIES:
908   case MAT_IGNORE_OFF_PROC_ENTRIES:
909   case MAT_USE_HASH_TABLE:
910   case MAT_SORTED_FULL:
911     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
912     break;
913   case MAT_SPD:
914   case MAT_SYMMETRIC:
915   case MAT_STRUCTURALLY_SYMMETRIC:
916   case MAT_HERMITIAN:
917   case MAT_SYMMETRY_ETERNAL:
918   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
919   case MAT_SPD_ETERNAL:
920     /* These options are handled directly by MatSetOption() */
921     break;
922   default:
923     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
924   }
925   PetscFunctionReturn(0);
926 }
927 
928 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
929 {
930   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
931   PetscInt     i, j, n, shift;
932   PetscScalar *x, zero = 0.0;
933 
934   PetscFunctionBegin;
935   PetscCall(VecGetLocalSize(v, &n));
936   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
937 
938   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
939     PetscInt *diag = a->diag;
940     PetscCall(VecGetArray(v, &x));
941     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
942     PetscCall(VecRestoreArray(v, &x));
943     PetscFunctionReturn(0);
944   }
945 
946   PetscCall(VecSet(v, zero));
947   PetscCall(VecGetArray(v, &x));
948   for (i = 0; i < n; i++) {                 /* loop over rows */
949     shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
950     x[i]  = 0;
951     for (j = 0; j < a->rlen[i]; j++) {
952       if (a->colidx[shift + j * 8] == i) {
953         x[i] = a->val[shift + j * 8];
954         break;
955       }
956     }
957   }
958   PetscCall(VecRestoreArray(v, &x));
959   PetscFunctionReturn(0);
960 }
961 
962 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
963 {
964   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
965   const PetscScalar *l, *r;
966   PetscInt           i, j, m, n, row;
967 
968   PetscFunctionBegin;
969   if (ll) {
970     /* The local size is used so that VecMPI can be passed to this routine
971        by MatDiagonalScale_MPISELL */
972     PetscCall(VecGetLocalSize(ll, &m));
973     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
974     PetscCall(VecGetArrayRead(ll, &l));
975     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
976       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
977         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
978           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
979         }
980       } else {
981         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
982       }
983     }
984     PetscCall(VecRestoreArrayRead(ll, &l));
985     PetscCall(PetscLogFlops(a->nz));
986   }
987   if (rr) {
988     PetscCall(VecGetLocalSize(rr, &n));
989     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
990     PetscCall(VecGetArrayRead(rr, &r));
991     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
992       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
993         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
994           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
995         }
996       } else {
997         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
998       }
999     }
1000     PetscCall(VecRestoreArrayRead(rr, &r));
1001     PetscCall(PetscLogFlops(a->nz));
1002   }
1003   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1004   PetscFunctionReturn(0);
1005 }
1006 
1007 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1008 {
1009   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1010   PetscInt    *cp, i, k, low, high, t, row, col, l;
1011   PetscInt     shift;
1012   MatScalar   *vp;
1013 
1014   PetscFunctionBegin;
1015   for (k = 0; k < m; k++) { /* loop over requested rows */
1016     row = im[k];
1017     if (row < 0) continue;
1018     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1019     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1020     cp    = a->colidx + shift;                  /* pointer to the row */
1021     vp    = a->val + shift;                     /* pointer to the row */
1022     for (l = 0; l < n; l++) {                   /* loop over requested columns */
1023       col = in[l];
1024       if (col < 0) continue;
1025       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1026       high = a->rlen[row];
1027       low  = 0; /* assume unsorted */
1028       while (high - low > 5) {
1029         t = (low + high) / 2;
1030         if (*(cp + t * 8) > col) high = t;
1031         else low = t;
1032       }
1033       for (i = low; i < high; i++) {
1034         if (*(cp + 8 * i) > col) break;
1035         if (*(cp + 8 * i) == col) {
1036           *v++ = *(vp + 8 * i);
1037           goto finished;
1038         }
1039       }
1040       *v++ = 0.0;
1041     finished:;
1042     }
1043   }
1044   PetscFunctionReturn(0);
1045 }
1046 
1047 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1048 {
1049   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1050   PetscInt          i, j, m = A->rmap->n, shift;
1051   const char       *name;
1052   PetscViewerFormat format;
1053 
1054   PetscFunctionBegin;
1055   PetscCall(PetscViewerGetFormat(viewer, &format));
1056   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1057     PetscInt nofinalvalue = 0;
1058     /*
1059     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1060       nofinalvalue = 1;
1061     }
1062     */
1063     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1064     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1065     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1066 #if defined(PETSC_USE_COMPLEX)
1067     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1068 #else
1069     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1070 #endif
1071     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1072 
1073     for (i = 0; i < m; i++) {
1074       shift = a->sliidx[i >> 3] + (i & 0x07);
1075       for (j = 0; j < a->rlen[i]; j++) {
1076 #if defined(PETSC_USE_COMPLEX)
1077         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1078 #else
1079         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]));
1080 #endif
1081       }
1082     }
1083     /*
1084     if (nofinalvalue) {
1085 #if defined(PETSC_USE_COMPLEX)
1086       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1087 #else
1088       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1089 #endif
1090     }
1091     */
1092     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1093     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1094     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1095   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1096     PetscFunctionReturn(0);
1097   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1098     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1099     for (i = 0; i < m; i++) {
1100       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1101       shift = a->sliidx[i >> 3] + (i & 0x07);
1102       for (j = 0; j < a->rlen[i]; j++) {
1103 #if defined(PETSC_USE_COMPLEX)
1104         if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1105           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1106         } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1107           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1108         } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1109           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1110         }
1111 #else
1112         if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1113 #endif
1114       }
1115       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1116     }
1117     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1118   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1119     PetscInt    cnt = 0, jcnt;
1120     PetscScalar value;
1121 #if defined(PETSC_USE_COMPLEX)
1122     PetscBool realonly = PETSC_TRUE;
1123     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1124       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1125         realonly = PETSC_FALSE;
1126         break;
1127       }
1128     }
1129 #endif
1130 
1131     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1132     for (i = 0; i < m; i++) {
1133       jcnt  = 0;
1134       shift = a->sliidx[i >> 3] + (i & 0x07);
1135       for (j = 0; j < A->cmap->n; j++) {
1136         if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1137           value = a->val[cnt++];
1138           jcnt++;
1139         } else {
1140           value = 0.0;
1141         }
1142 #if defined(PETSC_USE_COMPLEX)
1143         if (realonly) {
1144           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1145         } else {
1146           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1147         }
1148 #else
1149         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1150 #endif
1151       }
1152       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1153     }
1154     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1155   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1156     PetscInt fshift = 1;
1157     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1158 #if defined(PETSC_USE_COMPLEX)
1159     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1160 #else
1161     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1162 #endif
1163     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1164     for (i = 0; i < m; i++) {
1165       shift = a->sliidx[i >> 3] + (i & 0x07);
1166       for (j = 0; j < a->rlen[i]; j++) {
1167 #if defined(PETSC_USE_COMPLEX)
1168         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1169 #else
1170         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]));
1171 #endif
1172       }
1173     }
1174     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1175   } else if (format == PETSC_VIEWER_NATIVE) {
1176     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1177       PetscInt row;
1178       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1179       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1180 #if defined(PETSC_USE_COMPLEX)
1181         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1182           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1183         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1184           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1185         } else {
1186           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1187         }
1188 #else
1189         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]));
1190 #endif
1191       }
1192     }
1193   } else {
1194     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1195     if (A->factortype) {
1196       for (i = 0; i < m; i++) {
1197         shift = a->sliidx[i >> 3] + (i & 0x07);
1198         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1199         /* L part */
1200         for (j = shift; j < a->diag[i]; j += 8) {
1201 #if defined(PETSC_USE_COMPLEX)
1202           if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1203             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1204           } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1205             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1206           } else {
1207             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1208           }
1209 #else
1210           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1211 #endif
1212         }
1213         /* diagonal */
1214         j = a->diag[i];
1215 #if defined(PETSC_USE_COMPLEX)
1216         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1217           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1218         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1219           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1220         } else {
1221           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1222         }
1223 #else
1224         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1225 #endif
1226 
1227         /* U part */
1228         for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1229 #if defined(PETSC_USE_COMPLEX)
1230           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1231             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1232           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1233             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1234           } else {
1235             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1236           }
1237 #else
1238           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1239 #endif
1240         }
1241         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1242       }
1243     } else {
1244       for (i = 0; i < m; i++) {
1245         shift = a->sliidx[i >> 3] + (i & 0x07);
1246         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1247         for (j = 0; j < a->rlen[i]; j++) {
1248 #if defined(PETSC_USE_COMPLEX)
1249           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1250             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1251           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1252             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1253           } else {
1254             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1255           }
1256 #else
1257           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1258 #endif
1259         }
1260         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1261       }
1262     }
1263     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1264   }
1265   PetscCall(PetscViewerFlush(viewer));
1266   PetscFunctionReturn(0);
1267 }
1268 
1269 #include <petscdraw.h>
1270 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1271 {
1272   Mat               A = (Mat)Aa;
1273   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1274   PetscInt          i, j, m = A->rmap->n, shift;
1275   int               color;
1276   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1277   PetscViewer       viewer;
1278   PetscViewerFormat format;
1279 
1280   PetscFunctionBegin;
1281   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1282   PetscCall(PetscViewerGetFormat(viewer, &format));
1283   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1284 
1285   /* loop over matrix elements drawing boxes */
1286 
1287   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1288     PetscDrawCollectiveBegin(draw);
1289     /* Blue for negative, Cyan for zero and  Red for positive */
1290     color = PETSC_DRAW_BLUE;
1291     for (i = 0; i < m; i++) {
1292       shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1293       y_l   = m - i - 1.0;
1294       y_r   = y_l + 1.0;
1295       for (j = 0; j < a->rlen[i]; j++) {
1296         x_l = a->colidx[shift + j * 8];
1297         x_r = x_l + 1.0;
1298         if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1299         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1300       }
1301     }
1302     color = PETSC_DRAW_CYAN;
1303     for (i = 0; i < m; i++) {
1304       shift = a->sliidx[i >> 3] + (i & 0x07);
1305       y_l   = m - i - 1.0;
1306       y_r   = y_l + 1.0;
1307       for (j = 0; j < a->rlen[i]; j++) {
1308         x_l = a->colidx[shift + j * 8];
1309         x_r = x_l + 1.0;
1310         if (a->val[shift + 8 * j] != 0.) continue;
1311         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1312       }
1313     }
1314     color = PETSC_DRAW_RED;
1315     for (i = 0; i < m; i++) {
1316       shift = a->sliidx[i >> 3] + (i & 0x07);
1317       y_l   = m - i - 1.0;
1318       y_r   = y_l + 1.0;
1319       for (j = 0; j < a->rlen[i]; j++) {
1320         x_l = a->colidx[shift + j * 8];
1321         x_r = x_l + 1.0;
1322         if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1323         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1324       }
1325     }
1326     PetscDrawCollectiveEnd(draw);
1327   } else {
1328     /* use contour shading to indicate magnitude of values */
1329     /* first determine max of all nonzero values */
1330     PetscReal minv = 0.0, maxv = 0.0;
1331     PetscInt  count = 0;
1332     PetscDraw popup;
1333     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1334       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1335     }
1336     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1337     PetscCall(PetscDrawGetPopup(draw, &popup));
1338     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1339 
1340     PetscDrawCollectiveBegin(draw);
1341     for (i = 0; i < m; i++) {
1342       shift = a->sliidx[i >> 3] + (i & 0x07);
1343       y_l   = m - i - 1.0;
1344       y_r   = y_l + 1.0;
1345       for (j = 0; j < a->rlen[i]; j++) {
1346         x_l   = a->colidx[shift + j * 8];
1347         x_r   = x_l + 1.0;
1348         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1349         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1350         count++;
1351       }
1352     }
1353     PetscDrawCollectiveEnd(draw);
1354   }
1355   PetscFunctionReturn(0);
1356 }
1357 
1358 #include <petscdraw.h>
1359 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1360 {
1361   PetscDraw draw;
1362   PetscReal xr, yr, xl, yl, h, w;
1363   PetscBool isnull;
1364 
1365   PetscFunctionBegin;
1366   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1367   PetscCall(PetscDrawIsNull(draw, &isnull));
1368   if (isnull) PetscFunctionReturn(0);
1369 
1370   xr = A->cmap->n;
1371   yr = A->rmap->n;
1372   h  = yr / 10.0;
1373   w  = xr / 10.0;
1374   xr += w;
1375   yr += h;
1376   xl = -w;
1377   yl = -h;
1378   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1379   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1380   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1381   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1382   PetscCall(PetscDrawSave(draw));
1383   PetscFunctionReturn(0);
1384 }
1385 
1386 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1387 {
1388   PetscBool iascii, isbinary, isdraw;
1389 
1390   PetscFunctionBegin;
1391   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1392   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1393   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1394   if (iascii) {
1395     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1396   } else if (isbinary) {
1397     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1398   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1399   PetscFunctionReturn(0);
1400 }
1401 
1402 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1403 {
1404   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1405   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1406   MatScalar   *vp;
1407 
1408   PetscFunctionBegin;
1409   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1410   /* To do: compress out the unused elements */
1411   PetscCall(MatMarkDiagonal_SeqSELL(A));
1412   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1413   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1414   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1415   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1416   for (i = 0; i < a->totalslices; ++i) {
1417     shift = a->sliidx[i];                                      /* starting index of the slice */
1418     cp    = a->colidx + shift;                                 /* pointer to the column indices of the slice */
1419     vp    = a->val + shift;                                    /* pointer to the nonzero values of the slice */
1420     for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1421       row  = 8 * i + row_in_slice;
1422       nrow = a->rlen[row]; /* number of nonzeros in row */
1423       /*
1424         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1425         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1426       */
1427       lastcol = 0;
1428       if (nrow > 0) {                                /* nonempty row */
1429         lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1430       } else if (!row_in_slice) {                    /* first row of the currect slice is empty */
1431         for (j = 1; j < 8; j++) {
1432           if (a->rlen[8 * i + j]) {
1433             lastcol = cp[j];
1434             break;
1435           }
1436         }
1437       } else {
1438         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1439       }
1440 
1441       for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1442         cp[8 * k + row_in_slice] = lastcol;
1443         vp[8 * k + row_in_slice] = (MatScalar)0;
1444       }
1445     }
1446   }
1447 
1448   A->info.mallocs += a->reallocs;
1449   a->reallocs = 0;
1450 
1451   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1452   PetscFunctionReturn(0);
1453 }
1454 
1455 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1456 {
1457   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1458 
1459   PetscFunctionBegin;
1460   info->block_size   = 1.0;
1461   info->nz_allocated = a->maxallocmat;
1462   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1463   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1464   info->assemblies   = A->num_ass;
1465   info->mallocs      = A->info.mallocs;
1466   info->memory       = 0; /* REVIEW ME */
1467   if (A->factortype) {
1468     info->fill_ratio_given  = A->info.fill_ratio_given;
1469     info->fill_ratio_needed = A->info.fill_ratio_needed;
1470     info->factor_mallocs    = A->info.factor_mallocs;
1471   } else {
1472     info->fill_ratio_given  = 0;
1473     info->fill_ratio_needed = 0;
1474     info->factor_mallocs    = 0;
1475   }
1476   PetscFunctionReturn(0);
1477 }
1478 
1479 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1480 {
1481   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1482   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1483   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1484   MatScalar   *vp, value;
1485 
1486   PetscFunctionBegin;
1487   for (k = 0; k < m; k++) { /* loop over added rows */
1488     row = im[k];
1489     if (row < 0) continue;
1490     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1491     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1492     cp    = a->colidx + shift;                  /* pointer to the row */
1493     vp    = a->val + shift;                     /* pointer to the row */
1494     nrow  = a->rlen[row];
1495     low   = 0;
1496     high  = nrow;
1497 
1498     for (l = 0; l < n; l++) { /* loop over added columns */
1499       col = in[l];
1500       if (col < 0) continue;
1501       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1502       if (a->roworiented) {
1503         value = v[l + k * n];
1504       } else {
1505         value = v[k + l * m];
1506       }
1507       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1508 
1509       /* search in this row for the specified column, i indicates the column to be set */
1510       if (col <= lastcol) low = 0;
1511       else high = nrow;
1512       lastcol = col;
1513       while (high - low > 5) {
1514         t = (low + high) / 2;
1515         if (*(cp + t * 8) > col) high = t;
1516         else low = t;
1517       }
1518       for (i = low; i < high; i++) {
1519         if (*(cp + i * 8) > col) break;
1520         if (*(cp + i * 8) == col) {
1521           if (is == ADD_VALUES) *(vp + i * 8) += value;
1522           else *(vp + i * 8) = value;
1523           low = i + 1;
1524           goto noinsert;
1525         }
1526       }
1527       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1528       if (nonew == 1) goto noinsert;
1529       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1530       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1531       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1532       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1533       for (ii = nrow - 1; ii >= i; ii--) {
1534         *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1535         *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1536       }
1537       a->rlen[row]++;
1538       *(cp + i * 8) = col;
1539       *(vp + i * 8) = value;
1540       a->nz++;
1541       A->nonzerostate++;
1542       low = i + 1;
1543       high++;
1544       nrow++;
1545     noinsert:;
1546     }
1547     a->rlen[row] = nrow;
1548   }
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1553 {
1554   PetscFunctionBegin;
1555   /* If the two matrices have the same copy implementation, use fast copy. */
1556   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1557     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1558     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1559 
1560     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1561     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1562   } else {
1563     PetscCall(MatCopy_Basic(A, B, str));
1564   }
1565   PetscFunctionReturn(0);
1566 }
1567 
1568 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1569 {
1570   PetscFunctionBegin;
1571   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1572   PetscFunctionReturn(0);
1573 }
1574 
1575 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1576 {
1577   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1578 
1579   PetscFunctionBegin;
1580   *array = a->val;
1581   PetscFunctionReturn(0);
1582 }
1583 
1584 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1585 {
1586   PetscFunctionBegin;
1587   PetscFunctionReturn(0);
1588 }
1589 
1590 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1591 {
1592   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1593   PetscInt     i;
1594   MatScalar   *aval = a->val;
1595 
1596   PetscFunctionBegin;
1597   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1598   PetscFunctionReturn(0);
1599 }
1600 
1601 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1602 {
1603   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1604   PetscInt     i;
1605   MatScalar   *aval = a->val;
1606 
1607   PetscFunctionBegin;
1608   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1609   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1610   PetscFunctionReturn(0);
1611 }
1612 
1613 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1614 {
1615   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1616   MatScalar   *aval   = a->val;
1617   PetscScalar  oalpha = alpha;
1618   PetscBLASInt one    = 1, size;
1619 
1620   PetscFunctionBegin;
1621   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1622   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1623   PetscCall(PetscLogFlops(a->nz));
1624   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1625   PetscFunctionReturn(0);
1626 }
1627 
1628 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1629 {
1630   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1631 
1632   PetscFunctionBegin;
1633   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1634   PetscCall(MatShift_Basic(Y, a));
1635   PetscFunctionReturn(0);
1636 }
1637 
1638 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1639 {
1640   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1641   PetscScalar       *x, sum, *t;
1642   const MatScalar   *idiag = NULL, *mdiag;
1643   const PetscScalar *b, *xb;
1644   PetscInt           n, m = A->rmap->n, i, j, shift;
1645   const PetscInt    *diag;
1646 
1647   PetscFunctionBegin;
1648   its = its * lits;
1649 
1650   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1651   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1652   a->fshift = fshift;
1653   a->omega  = omega;
1654 
1655   diag  = a->diag;
1656   t     = a->ssor_work;
1657   idiag = a->idiag;
1658   mdiag = a->mdiag;
1659 
1660   PetscCall(VecGetArray(xx, &x));
1661   PetscCall(VecGetArrayRead(bb, &b));
1662   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1663   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1664   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1665   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1666 
1667   if (flag & SOR_ZERO_INITIAL_GUESS) {
1668     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1669       for (i = 0; i < m; i++) {
1670         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1671         sum   = b[i];
1672         n     = (diag[i] - shift) / 8;
1673         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1674         t[i] = sum;
1675         x[i] = sum * idiag[i];
1676       }
1677       xb = t;
1678       PetscCall(PetscLogFlops(a->nz));
1679     } else xb = b;
1680     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1681       for (i = m - 1; i >= 0; i--) {
1682         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1683         sum   = xb[i];
1684         n     = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1685         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1686         if (xb == b) {
1687           x[i] = sum * idiag[i];
1688         } else {
1689           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1690         }
1691       }
1692       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1693     }
1694     its--;
1695   }
1696   while (its--) {
1697     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1698       for (i = 0; i < m; i++) {
1699         /* lower */
1700         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1701         sum   = b[i];
1702         n     = (diag[i] - shift) / 8;
1703         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1704         t[i] = sum; /* save application of the lower-triangular part */
1705         /* upper */
1706         n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1707         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1708         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1709       }
1710       xb = t;
1711       PetscCall(PetscLogFlops(2.0 * a->nz));
1712     } else xb = b;
1713     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1714       for (i = m - 1; i >= 0; i--) {
1715         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1716         sum   = xb[i];
1717         if (xb == b) {
1718           /* whole matrix (no checkpointing available) */
1719           n = a->rlen[i];
1720           for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1721           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1722         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1723           n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1724           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1725           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1726         }
1727       }
1728       if (xb == b) {
1729         PetscCall(PetscLogFlops(2.0 * a->nz));
1730       } else {
1731         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1732       }
1733     }
1734   }
1735   PetscCall(VecRestoreArray(xx, &x));
1736   PetscCall(VecRestoreArrayRead(bb, &b));
1737   PetscFunctionReturn(0);
1738 }
1739 
1740 /* -------------------------------------------------------------------*/
1741 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1742                                        MatGetRow_SeqSELL,
1743                                        MatRestoreRow_SeqSELL,
1744                                        MatMult_SeqSELL,
1745                                        /* 4*/ MatMultAdd_SeqSELL,
1746                                        MatMultTranspose_SeqSELL,
1747                                        MatMultTransposeAdd_SeqSELL,
1748                                        NULL,
1749                                        NULL,
1750                                        NULL,
1751                                        /* 10*/ NULL,
1752                                        NULL,
1753                                        NULL,
1754                                        MatSOR_SeqSELL,
1755                                        NULL,
1756                                        /* 15*/ MatGetInfo_SeqSELL,
1757                                        MatEqual_SeqSELL,
1758                                        MatGetDiagonal_SeqSELL,
1759                                        MatDiagonalScale_SeqSELL,
1760                                        NULL,
1761                                        /* 20*/ NULL,
1762                                        MatAssemblyEnd_SeqSELL,
1763                                        MatSetOption_SeqSELL,
1764                                        MatZeroEntries_SeqSELL,
1765                                        /* 24*/ NULL,
1766                                        NULL,
1767                                        NULL,
1768                                        NULL,
1769                                        NULL,
1770                                        /* 29*/ MatSetUp_SeqSELL,
1771                                        NULL,
1772                                        NULL,
1773                                        NULL,
1774                                        NULL,
1775                                        /* 34*/ MatDuplicate_SeqSELL,
1776                                        NULL,
1777                                        NULL,
1778                                        NULL,
1779                                        NULL,
1780                                        /* 39*/ NULL,
1781                                        NULL,
1782                                        NULL,
1783                                        MatGetValues_SeqSELL,
1784                                        MatCopy_SeqSELL,
1785                                        /* 44*/ NULL,
1786                                        MatScale_SeqSELL,
1787                                        MatShift_SeqSELL,
1788                                        NULL,
1789                                        NULL,
1790                                        /* 49*/ NULL,
1791                                        NULL,
1792                                        NULL,
1793                                        NULL,
1794                                        NULL,
1795                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1796                                        NULL,
1797                                        NULL,
1798                                        NULL,
1799                                        NULL,
1800                                        /* 59*/ NULL,
1801                                        MatDestroy_SeqSELL,
1802                                        MatView_SeqSELL,
1803                                        NULL,
1804                                        NULL,
1805                                        /* 64*/ NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        NULL,
1809                                        NULL,
1810                                        /* 69*/ NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        NULL,
1814                                        NULL,
1815                                        /* 74*/ NULL,
1816                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1817                                        NULL,
1818                                        NULL,
1819                                        NULL,
1820                                        /* 79*/ NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        NULL,
1825                                        /* 84*/ NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        NULL,
1829                                        NULL,
1830                                        /* 89*/ NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        NULL,
1834                                        NULL,
1835                                        /* 94*/ NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        NULL,
1839                                        NULL,
1840                                        /* 99*/ NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        MatConjugate_SeqSELL,
1844                                        NULL,
1845                                        /*104*/ NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        NULL,
1850                                        /*109*/ NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        NULL,
1854                                        MatMissingDiagonal_SeqSELL,
1855                                        /*114*/ NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        NULL,
1860                                        /*119*/ NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        NULL,
1864                                        NULL,
1865                                        /*124*/ NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        NULL,
1869                                        NULL,
1870                                        /*129*/ NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        NULL,
1875                                        /*134*/ NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        NULL,
1879                                        NULL,
1880                                        /*139*/ NULL,
1881                                        NULL,
1882                                        NULL,
1883                                        MatFDColoringSetUp_SeqXAIJ,
1884                                        NULL,
1885                                        /*144*/ NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        NULL,
1890                                        NULL,
1891                                        /*150*/ NULL};
1892 
1893 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1894 {
1895   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1896 
1897   PetscFunctionBegin;
1898   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1899 
1900   /* allocate space for values if not already there */
1901   if (!a->saved_values) { PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); }
1902 
1903   /* copy values over */
1904   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1905   PetscFunctionReturn(0);
1906 }
1907 
1908 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1909 {
1910   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1911 
1912   PetscFunctionBegin;
1913   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1914   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1915   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1916   PetscFunctionReturn(0);
1917 }
1918 
1919 /*@C
1920  MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
1921 
1922  Not Collective
1923 
1924  Input Parameters:
1925  .  mat - a `MATSEQSELL` matrix
1926  .  array - pointer to the data
1927 
1928  Level: intermediate
1929 
1930  .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1931  @*/
1932 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
1933 {
1934   PetscFunctionBegin;
1935   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1936   PetscFunctionReturn(0);
1937 }
1938 
1939 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1940 {
1941   Mat_SeqSELL *b;
1942   PetscMPIInt  size;
1943 
1944   PetscFunctionBegin;
1945   PetscCall(PetscCitationsRegister(citation, &cited));
1946   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
1947   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
1948 
1949   PetscCall(PetscNew(&b));
1950 
1951   B->data = (void *)b;
1952 
1953   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
1954 
1955   b->row                = NULL;
1956   b->col                = NULL;
1957   b->icol               = NULL;
1958   b->reallocs           = 0;
1959   b->ignorezeroentries  = PETSC_FALSE;
1960   b->roworiented        = PETSC_TRUE;
1961   b->nonew              = 0;
1962   b->diag               = NULL;
1963   b->solve_work         = NULL;
1964   B->spptr              = NULL;
1965   b->saved_values       = NULL;
1966   b->idiag              = NULL;
1967   b->mdiag              = NULL;
1968   b->ssor_work          = NULL;
1969   b->omega              = 1.0;
1970   b->fshift             = 0.0;
1971   b->idiagvalid         = PETSC_FALSE;
1972   b->keepnonzeropattern = PETSC_FALSE;
1973 
1974   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
1975   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
1976   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
1977   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
1978   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
1979   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
1980   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
1981   PetscFunctionReturn(0);
1982 }
1983 
1984 /*
1985  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1986  */
1987 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
1988 {
1989   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1990   PetscInt     i, m                           = A->rmap->n;
1991   PetscInt     totalslices = a->totalslices;
1992 
1993   PetscFunctionBegin;
1994   C->factortype = A->factortype;
1995   c->row        = NULL;
1996   c->col        = NULL;
1997   c->icol       = NULL;
1998   c->reallocs   = 0;
1999   C->assembled  = PETSC_TRUE;
2000 
2001   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2002   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2003 
2004   PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
2005   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2006 
2007   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2008   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2009 
2010   /* allocate the matrix space */
2011   if (mallocmatspace) {
2012     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2013 
2014     c->singlemalloc = PETSC_TRUE;
2015 
2016     if (m > 0) {
2017       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2018       if (cpvalues == MAT_COPY_VALUES) {
2019         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2020       } else {
2021         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2022       }
2023     }
2024   }
2025 
2026   c->ignorezeroentries = a->ignorezeroentries;
2027   c->roworiented       = a->roworiented;
2028   c->nonew             = a->nonew;
2029   if (a->diag) {
2030     PetscCall(PetscMalloc1(m, &c->diag));
2031     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2032   } else c->diag = NULL;
2033 
2034   c->solve_work         = NULL;
2035   c->saved_values       = NULL;
2036   c->idiag              = NULL;
2037   c->ssor_work          = NULL;
2038   c->keepnonzeropattern = a->keepnonzeropattern;
2039   c->free_val           = PETSC_TRUE;
2040   c->free_colidx        = PETSC_TRUE;
2041 
2042   c->maxallocmat  = a->maxallocmat;
2043   c->maxallocrow  = a->maxallocrow;
2044   c->rlenmax      = a->rlenmax;
2045   c->nz           = a->nz;
2046   C->preallocated = PETSC_TRUE;
2047 
2048   c->nonzerorowcnt = a->nonzerorowcnt;
2049   C->nonzerostate  = A->nonzerostate;
2050 
2051   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2052   PetscFunctionReturn(0);
2053 }
2054 
2055 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2056 {
2057   PetscFunctionBegin;
2058   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2059   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2060   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2061   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2062   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2063   PetscFunctionReturn(0);
2064 }
2065 
2066 /*MC
2067    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2068    based on the sliced Ellpack format
2069 
2070    Options Database Keys:
2071 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2072 
2073    Level: beginner
2074 
2075 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2076 M*/
2077 
2078 /*MC
2079    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2080 
2081    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2082    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2083   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2084   for communicators controlling multiple processes.  It is recommended that you call both of
2085   the above preallocation routines for simplicity.
2086 
2087    Options Database Keys:
2088 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2089 
2090   Level: beginner
2091 
2092   Notes:
2093    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2094 
2095    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2096    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2097 
2098   Developer Notes:
2099    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2100 
2101    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2102 .vb
2103                             (2 0  3 4)
2104    Consider the matrix A =  (5 0  6 0)
2105                             (0 0  7 8)
2106                             (0 0  9 9)
2107 
2108    symbolically the Ellpack format can be written as
2109 
2110         (2 3 4 |)           (0 2 3 |)
2111    v =  (5 6 0 |)  colidx = (0 2 2 |)
2112         --------            ---------
2113         (7 8 |)             (2 3 |)
2114         (9 9 |)             (2 3 |)
2115 
2116     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2117     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2118     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2119 
2120     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2121 
2122 .ve
2123 
2124       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2125 
2126  References:
2127 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2128    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2129 
2130 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2131 M*/
2132 
2133 /*@C
2134        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2135 
2136  Collective on comm
2137 
2138  Input Parameters:
2139 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2140 .  m - number of rows
2141 .  n - number of columns
2142 .  rlenmax - maximum number of nonzeros in a row
2143 -  rlen - array containing the number of nonzeros in the various rows
2144  (possibly different for each row) or NULL
2145 
2146  Output Parameter:
2147 .  A - the matrix
2148 
2149  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2150  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2151  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2152 
2153  Notes:
2154  If nnz is given then nz is ignored
2155 
2156  Specify the preallocated storage with either rlenmax or rlen (not both).
2157  Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory
2158  allocation.  For large problems you MUST preallocate memory or you
2159  will get TERRIBLE performance, see the users' manual chapter on matrices.
2160 
2161  Level: intermediate
2162 
2163  .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2164  @*/
2165 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A)
2166 {
2167   PetscFunctionBegin;
2168   PetscCall(MatCreate(comm, A));
2169   PetscCall(MatSetSizes(*A, m, n, m, n));
2170   PetscCall(MatSetType(*A, MATSEQSELL));
2171   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen));
2172   PetscFunctionReturn(0);
2173 }
2174 
2175 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2176 {
2177   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2178   PetscInt     totalslices = a->totalslices;
2179 
2180   PetscFunctionBegin;
2181   /* If the  matrix dimensions are not equal,or no of nonzeros */
2182   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2183     *flg = PETSC_FALSE;
2184     PetscFunctionReturn(0);
2185   }
2186   /* if the a->colidx are the same */
2187   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2188   if (!*flg) PetscFunctionReturn(0);
2189   /* if a->val are the same */
2190   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2191   PetscFunctionReturn(0);
2192 }
2193 
2194 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2195 {
2196   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2197 
2198   PetscFunctionBegin;
2199   a->idiagvalid = PETSC_FALSE;
2200   PetscFunctionReturn(0);
2201 }
2202 
2203 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2204 {
2205 #if defined(PETSC_USE_COMPLEX)
2206   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2207   PetscInt     i;
2208   PetscScalar *val = a->val;
2209 
2210   PetscFunctionBegin;
2211   for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2212 #else
2213   PetscFunctionBegin;
2214 #endif
2215   PetscFunctionReturn(0);
2216 }
2217