xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 07425a8d4172ec73b7b53c5ce4d6ba1b92fe45cf)
1 /*
2   Defines the basic matrix operations for the SELL matrix storage format.
3 */
4 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
5 #include <petscblaslapack.h>
6 #include <petsc/private/kernels/blocktranspose.h>
7 
8 static PetscBool  cited      = PETSC_FALSE;
9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
10                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
11                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
12                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
13                                " year = 2018\n"
14                                "}\n";
15 
16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
17 
18   #include <immintrin.h>
19 
20   #if !defined(_MM_SCALE_8)
21     #define _MM_SCALE_8 8
22   #endif
23 
24   #if defined(__AVX512F__)
25     /* these do not work
26    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
27    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
28   */
29     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
30       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
31       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
32       vec_vals = _mm512_loadu_pd(aval); \
33       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
34       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
35   #elif defined(__AVX2__) && defined(__FMA__)
36     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
37       vec_vals = _mm256_loadu_pd(aval); \
38       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
39       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
40       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
41   #endif
42 #endif /* PETSC_HAVE_IMMINTRIN_H */
43 
44 /*@
45   MatSeqSELLSetPreallocation - For good matrix assembly performance
46   the user should preallocate the matrix storage by setting the parameter `nz`
47   (or the array `nnz`).
48 
49   Collective
50 
51   Input Parameters:
52 + B       - The `MATSEQSELL` matrix
53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
54 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
55 
56   Level: intermediate
57 
58   Notes:
59   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
60   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
61   allocation.
62 
63   You can call `MatGetInfo()` to get information on how effective the preallocation was;
64   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
65   You can also run with the option `-info` and look for messages with the string
66   malloc in them to see if additional memory allocation was needed.
67 
68   Developer Notes:
69   Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
70   entries or columns indices.
71 
72   The maximum number of nonzeos in any row should be as accurate as possible.
73   If it is underestimated, you will get bad performance due to reallocation
74   (`MatSeqXSELLReallocateSELL()`).
75 
76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
77  @*/
78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
79 {
80   PetscFunctionBegin;
81   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
82   PetscValidType(B, 1);
83   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
84   PetscFunctionReturn(PETSC_SUCCESS);
85 }
86 
87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
88 {
89   Mat_SeqSELL *b;
90   PetscInt     i, j, totalslices;
91 #if defined(PETSC_HAVE_CUPM)
92   PetscInt rlenmax = 0;
93 #endif
94   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
95 
96   PetscFunctionBegin;
97   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98   if (maxallocrow == MAT_SKIP_ALLOCATION) {
99     skipallocation = PETSC_TRUE;
100     maxallocrow    = 0;
101   }
102 
103   PetscCall(PetscLayoutSetUp(B->rmap));
104   PetscCall(PetscLayoutSetUp(B->cmap));
105 
106   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109   if (rlen) {
110     for (i = 0; i < B->rmap->n; i++) {
111       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113     }
114   }
115 
116   B->preallocated = PETSC_TRUE;
117 
118   b = (Mat_SeqSELL *)B->data;
119 
120   if (!b->sliceheight) { /* not set yet */
121 #if defined(PETSC_HAVE_CUPM)
122     b->sliceheight = 16;
123 #else
124     b->sliceheight = 8;
125 #endif
126   }
127   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
128   b->totalslices = totalslices;
129   if (!skipallocation) {
130     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
131 
132     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
134     }
135     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
136       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
137       else if (maxallocrow < 0) maxallocrow = 1;
138 #if defined(PETSC_HAVE_CUPM)
139       rlenmax = maxallocrow;
140       /* Pad the slice to DEVICE_MEM_ALIGN */
141       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
142 #endif
143       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
144     } else {
145 #if defined(PETSC_HAVE_CUPM)
146       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
147 #endif
148       maxallocrow  = 0;
149       b->sliidx[0] = 0;
150       for (i = 1; i < totalslices; i++) {
151         b->sliidx[i] = 0;
152         for (j = 0; j < b->sliceheight; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]);
153 #if defined(PETSC_HAVE_CUPM)
154         if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */
155           rlenmax      = PetscMax(b->sliidx[i], rlenmax);
156           b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
157         }
158 #endif
159         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161       }
162       /* last slice */
163       b->sliidx[totalslices] = 0;
164       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165 #if defined(PETSC_HAVE_CUPM)
166       if (mul != 0) {
167         rlenmax                = PetscMax(b->sliidx[i], rlenmax);
168         b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
169       }
170 #endif
171       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
172       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
173     }
174 
175     /* allocate space for val, colidx, rlen */
176     /* FIXME: should B's old memory be unlogged? */
177     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
178     /* FIXME: assuming an element of the bit array takes 8 bits */
179     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
180     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
181     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
182 
183     b->singlemalloc = PETSC_TRUE;
184     b->free_val     = PETSC_TRUE;
185     b->free_colidx  = PETSC_TRUE;
186   } else {
187     b->free_val    = PETSC_FALSE;
188     b->free_colidx = PETSC_FALSE;
189   }
190 
191   b->nz          = 0;
192   b->maxallocrow = maxallocrow;
193 #if defined(PETSC_HAVE_CUPM)
194   b->rlenmax = rlenmax;
195 #else
196   b->rlenmax = maxallocrow;
197 #endif
198   b->maxallocmat      = b->sliidx[totalslices];
199   B->info.nz_unneeded = (double)b->maxallocmat;
200   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
201   PetscFunctionReturn(PETSC_SUCCESS);
202 }
203 
204 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
205 {
206   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
207   PetscInt     shift;
208 
209   PetscFunctionBegin;
210   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
211   if (nz) *nz = a->rlen[row];
212   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
213   if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
214   if (idx) {
215     PetscInt j;
216     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
217     *idx = a->getrowcols;
218   }
219   if (v) {
220     PetscInt j;
221     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
222     *v = a->getrowvals;
223   }
224   PetscFunctionReturn(PETSC_SUCCESS);
225 }
226 
227 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
228 {
229   PetscFunctionBegin;
230   PetscFunctionReturn(PETSC_SUCCESS);
231 }
232 
233 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
234 {
235   Mat          B;
236   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
237   PetscInt     i;
238 
239   PetscFunctionBegin;
240   if (reuse == MAT_REUSE_MATRIX) {
241     B = *newmat;
242     PetscCall(MatZeroEntries(B));
243   } else {
244     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
245     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
246     PetscCall(MatSetType(B, MATSEQAIJ));
247     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
248   }
249 
250   for (i = 0; i < A->rmap->n; i++) {
251     PetscInt     nz = 0, *cols = NULL;
252     PetscScalar *vals = NULL;
253 
254     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
255     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
256     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
257   }
258 
259   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
260   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
261   B->rmap->bs = A->rmap->bs;
262 
263   if (reuse == MAT_INPLACE_MATRIX) {
264     PetscCall(MatHeaderReplace(A, &B));
265   } else {
266     *newmat = B;
267   }
268   PetscFunctionReturn(PETSC_SUCCESS);
269 }
270 
271 #include <../src/mat/impls/aij/seq/aij.h>
272 
273 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
274 {
275   Mat                B;
276   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
277   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
278   const PetscInt    *cols;
279   const PetscScalar *vals;
280 
281   PetscFunctionBegin;
282   if (reuse == MAT_REUSE_MATRIX) {
283     B = *newmat;
284   } else {
285     if (PetscDefined(USE_DEBUG) || !a->ilen) {
286       PetscCall(PetscMalloc1(m, &rowlengths));
287       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
288     }
289     if (PetscDefined(USE_DEBUG) && a->ilen) {
290       PetscBool eq;
291       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
292       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
293       PetscCall(PetscFree(rowlengths));
294       rowlengths = a->ilen;
295     } else if (a->ilen) rowlengths = a->ilen;
296     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
297     PetscCall(MatSetSizes(B, m, n, m, n));
298     PetscCall(MatSetType(B, MATSEQSELL));
299     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
300     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
301   }
302 
303   for (row = 0; row < m; row++) {
304     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
305     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
306     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
307   }
308   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
309   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
310   B->rmap->bs = A->rmap->bs;
311 
312   if (reuse == MAT_INPLACE_MATRIX) {
313     PetscCall(MatHeaderReplace(A, &B));
314   } else {
315     *newmat = B;
316   }
317   PetscFunctionReturn(PETSC_SUCCESS);
318 }
319 
320 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
321 {
322   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
323   PetscScalar       *y;
324   const PetscScalar *x;
325   const MatScalar   *aval        = a->val;
326   PetscInt           totalslices = a->totalslices;
327   const PetscInt    *acolidx     = a->colidx;
328   PetscInt           i, j;
329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
330   __m512d  vec_x, vec_y, vec_vals;
331   __m256i  vec_idx;
332   __mmask8 mask;
333   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
334   __m256i  vec_idx2, vec_idx3, vec_idx4;
335 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336   __m128i   vec_idx;
337   __m256d   vec_x, vec_y, vec_y2, vec_vals;
338   MatScalar yval;
339   PetscInt  r, rows_left, row, nnz_in_row;
340 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
341   __m128d   vec_x_tmp;
342   __m256d   vec_x, vec_y, vec_y2, vec_vals;
343   MatScalar yval;
344   PetscInt  r, rows_left, row, nnz_in_row;
345 #else
346   PetscInt     k, sliceheight = a->sliceheight;
347   PetscScalar *sum;
348 #endif
349 
350 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
351   #pragma disjoint(*x, *y, *aval)
352 #endif
353 
354   PetscFunctionBegin;
355   PetscCall(VecGetArrayRead(xx, &x));
356   PetscCall(VecGetArray(yy, &y));
357 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
358   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
359   for (i = 0; i < totalslices; i++) { /* loop over slices */
360     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
362 
363     vec_y  = _mm512_setzero_pd();
364     vec_y2 = _mm512_setzero_pd();
365     vec_y3 = _mm512_setzero_pd();
366     vec_y4 = _mm512_setzero_pd();
367 
368     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
369     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
370     case 3:
371       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
372       acolidx += 8;
373       aval += 8;
374       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
375       acolidx += 8;
376       aval += 8;
377       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
378       acolidx += 8;
379       aval += 8;
380       j += 3;
381       break;
382     case 2:
383       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
384       acolidx += 8;
385       aval += 8;
386       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
387       acolidx += 8;
388       aval += 8;
389       j += 2;
390       break;
391     case 1:
392       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
393       acolidx += 8;
394       aval += 8;
395       j += 1;
396       break;
397     }
398   #pragma novector
399     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
400       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
401       acolidx += 8;
402       aval += 8;
403       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
404       acolidx += 8;
405       aval += 8;
406       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
407       acolidx += 8;
408       aval += 8;
409       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
410       acolidx += 8;
411       aval += 8;
412     }
413 
414     vec_y = _mm512_add_pd(vec_y, vec_y2);
415     vec_y = _mm512_add_pd(vec_y, vec_y3);
416     vec_y = _mm512_add_pd(vec_y, vec_y4);
417     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
418       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
419       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
420     } else {
421       _mm512_storeu_pd(&y[8 * i], vec_y);
422     }
423   }
424 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
425   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
426   for (i = 0; i < totalslices; i++) { /* loop over full slices */
427     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
429 
430     /* last slice may have padding rows. Don't use vectorization. */
431     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
432       rows_left = A->rmap->n - 8 * i;
433       for (r = 0; r < rows_left; ++r) {
434         yval       = (MatScalar)0;
435         row        = 8 * i + r;
436         nnz_in_row = a->rlen[row];
437         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
438         y[row] = yval;
439       }
440       break;
441     }
442 
443     vec_y  = _mm256_setzero_pd();
444     vec_y2 = _mm256_setzero_pd();
445 
446   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447   #pragma novector
448   #pragma unroll(2)
449     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
450       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
451       aval += 4;
452       acolidx += 4;
453       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
454       aval += 4;
455       acolidx += 4;
456     }
457 
458     _mm256_storeu_pd(y + i * 8, vec_y);
459     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
460   }
461 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
462   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
463   for (i = 0; i < totalslices; i++) { /* loop over full slices */
464     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
466 
467     vec_y  = _mm256_setzero_pd();
468     vec_y2 = _mm256_setzero_pd();
469 
470     /* last slice may have padding rows. Don't use vectorization. */
471     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
472       rows_left = A->rmap->n - 8 * i;
473       for (r = 0; r < rows_left; ++r) {
474         yval       = (MatScalar)0;
475         row        = 8 * i + r;
476         nnz_in_row = a->rlen[row];
477         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
478         y[row] = yval;
479       }
480       break;
481     }
482 
483   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
484   #pragma novector
485   #pragma unroll(2)
486     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
487       vec_vals  = _mm256_loadu_pd(aval);
488       vec_x_tmp = _mm_setzero_pd();
489       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
490       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
491       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
492       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
493       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
494       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
495       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
496       aval += 4;
497 
498       vec_vals  = _mm256_loadu_pd(aval);
499       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
500       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
501       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
502       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
503       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
504       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
505       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
506       aval += 4;
507     }
508 
509     _mm256_storeu_pd(y + i * 8, vec_y);
510     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
511   }
512 #else
513   PetscCall(PetscMalloc1(sliceheight, &sum));
514   for (i = 0; i < totalslices; i++) { /* loop over slices */
515     for (j = 0; j < sliceheight; j++) {
516       sum[j] = 0.0;
517       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
518     }
519     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
520       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
521     } else {
522       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
523     }
524   }
525   PetscCall(PetscFree(sum));
526 #endif
527 
528   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
529   PetscCall(VecRestoreArrayRead(xx, &x));
530   PetscCall(VecRestoreArray(yy, &y));
531   PetscFunctionReturn(PETSC_SUCCESS);
532 }
533 
534 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
535 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
536 {
537   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
538   PetscScalar       *y, *z;
539   const PetscScalar *x;
540   const MatScalar   *aval        = a->val;
541   PetscInt           totalslices = a->totalslices;
542   const PetscInt    *acolidx     = a->colidx;
543   PetscInt           i, j;
544 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
545   __m512d  vec_x, vec_y, vec_vals;
546   __m256i  vec_idx;
547   __mmask8 mask = 0;
548   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
549   __m256i  vec_idx2, vec_idx3, vec_idx4;
550 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
551   __m128d   vec_x_tmp;
552   __m256d   vec_x, vec_y, vec_y2, vec_vals;
553   MatScalar yval;
554   PetscInt  r, row, nnz_in_row;
555 #else
556   PetscInt     k, sliceheight = a->sliceheight;
557   PetscScalar *sum;
558 #endif
559 
560 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
561   #pragma disjoint(*x, *y, *aval)
562 #endif
563 
564   PetscFunctionBegin;
565   if (!a->nz) {
566     PetscCall(VecCopy(yy, zz));
567     PetscFunctionReturn(PETSC_SUCCESS);
568   }
569   PetscCall(VecGetArrayRead(xx, &x));
570   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
571 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
572   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
573   for (i = 0; i < totalslices; i++) { /* loop over slices */
574     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
576 
577     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
578       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
579       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
580     } else {
581       vec_y = _mm512_loadu_pd(&y[8 * i]);
582     }
583     vec_y2 = _mm512_setzero_pd();
584     vec_y3 = _mm512_setzero_pd();
585     vec_y4 = _mm512_setzero_pd();
586 
587     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
588     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
589     case 3:
590       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
591       acolidx += 8;
592       aval += 8;
593       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
594       acolidx += 8;
595       aval += 8;
596       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
597       acolidx += 8;
598       aval += 8;
599       j += 3;
600       break;
601     case 2:
602       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
603       acolidx += 8;
604       aval += 8;
605       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
606       acolidx += 8;
607       aval += 8;
608       j += 2;
609       break;
610     case 1:
611       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
612       acolidx += 8;
613       aval += 8;
614       j += 1;
615       break;
616     }
617   #pragma novector
618     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
619       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
620       acolidx += 8;
621       aval += 8;
622       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
623       acolidx += 8;
624       aval += 8;
625       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
626       acolidx += 8;
627       aval += 8;
628       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
629       acolidx += 8;
630       aval += 8;
631     }
632 
633     vec_y = _mm512_add_pd(vec_y, vec_y2);
634     vec_y = _mm512_add_pd(vec_y, vec_y3);
635     vec_y = _mm512_add_pd(vec_y, vec_y4);
636     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
637       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
638     } else {
639       _mm512_storeu_pd(&z[8 * i], vec_y);
640     }
641   }
642 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
643   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
644   for (i = 0; i < totalslices; i++) { /* loop over full slices */
645     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
647 
648     /* last slice may have padding rows. Don't use vectorization. */
649     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
650       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
651         row        = 8 * i + r;
652         yval       = (MatScalar)0.0;
653         nnz_in_row = a->rlen[row];
654         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
655         z[row] = y[row] + yval;
656       }
657       break;
658     }
659 
660     vec_y  = _mm256_loadu_pd(y + 8 * i);
661     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
662 
663     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
664     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
665       vec_vals  = _mm256_loadu_pd(aval);
666       vec_x_tmp = _mm_setzero_pd();
667       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
668       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
669       vec_x     = _mm256_setzero_pd();
670       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
671       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
672       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
673       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
674       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
675       aval += 4;
676 
677       vec_vals  = _mm256_loadu_pd(aval);
678       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
679       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
680       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
681       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
682       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
683       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
684       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
685       aval += 4;
686     }
687 
688     _mm256_storeu_pd(z + i * 8, vec_y);
689     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
690   }
691 #else
692   PetscCall(PetscMalloc1(sliceheight, &sum));
693   for (i = 0; i < totalslices; i++) { /* loop over slices */
694     for (j = 0; j < sliceheight; j++) {
695       sum[j] = 0.0;
696       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
697     }
698     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
699       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
700     } else {
701       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
702     }
703   }
704   PetscCall(PetscFree(sum));
705 #endif
706 
707   PetscCall(PetscLogFlops(2.0 * a->nz));
708   PetscCall(VecRestoreArrayRead(xx, &x));
709   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
710   PetscFunctionReturn(PETSC_SUCCESS);
711 }
712 
713 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
714 {
715   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
716   PetscScalar       *y;
717   const PetscScalar *x;
718   const MatScalar   *aval    = a->val;
719   const PetscInt    *acolidx = a->colidx;
720   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
721 
722 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
723   #pragma disjoint(*x, *y, *aval)
724 #endif
725 
726   PetscFunctionBegin;
727   if (A->symmetric == PETSC_BOOL3_TRUE) {
728     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
729     PetscFunctionReturn(PETSC_SUCCESS);
730   }
731   if (zz != yy) PetscCall(VecCopy(zz, yy));
732 
733   if (a->nz) {
734     PetscCall(VecGetArrayRead(xx, &x));
735     PetscCall(VecGetArray(yy, &y));
736     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
737       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
738         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
739           row        = sliceheight * i + r;
740           nnz_in_row = a->rlen[row];
741           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
742         }
743         break;
744       }
745       for (r = 0; r < sliceheight; ++r)
746         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
747     }
748     PetscCall(PetscLogFlops(2.0 * a->nz));
749     PetscCall(VecRestoreArrayRead(xx, &x));
750     PetscCall(VecRestoreArray(yy, &y));
751   }
752   PetscFunctionReturn(PETSC_SUCCESS);
753 }
754 
755 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
756 {
757   PetscFunctionBegin;
758   if (A->symmetric == PETSC_BOOL3_TRUE) {
759     PetscCall(MatMult_SeqSELL(A, xx, yy));
760   } else {
761     PetscCall(VecSet(yy, 0.0));
762     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
763   }
764   PetscFunctionReturn(PETSC_SUCCESS);
765 }
766 
767 /*
768      Checks for missing diagonals
769 */
770 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
771 {
772   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
773   PetscInt    *diag, i;
774 
775   PetscFunctionBegin;
776   *missing = PETSC_FALSE;
777   if (A->rmap->n > 0 && !a->colidx) {
778     *missing = PETSC_TRUE;
779     if (d) *d = 0;
780     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
781   } else {
782     diag = a->diag;
783     for (i = 0; i < A->rmap->n; i++) {
784       if (diag[i] == -1) {
785         *missing = PETSC_TRUE;
786         if (d) *d = i;
787         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
788         break;
789       }
790     }
791   }
792   PetscFunctionReturn(PETSC_SUCCESS);
793 }
794 
795 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
796 {
797   Mat_SeqSELL   *a = (Mat_SeqSELL *)A->data;
798   const PetscInt m = A->rmap->n;
799   PetscInt       shift;
800 
801   PetscFunctionBegin;
802   if (a->diag && a->diagNonzeroState == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
803   if (!a->diag) {
804     PetscCall(PetscMalloc1(m, &a->diag));
805     a->free_diag = PETSC_TRUE;
806   }
807   a->diagDense = PETSC_TRUE;
808   for (PetscInt i = 0; i < m; i++) { /* loop over rows */
809     PetscBool found = PETSC_FALSE;
810 
811     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
812     a->diag[i] = -1;
813     for (PetscInt j = 0; j < a->rlen[i]; j++) {
814       if (a->colidx[shift + a->sliceheight * j] == i) {
815         a->diag[i] = shift + a->sliceheight * j;
816         found      = PETSC_TRUE;
817         break;
818       }
819     }
820     if (!found) a->diagDense = PETSC_FALSE;
821   }
822   a->diagNonzeroState = A->nonzerostate;
823   PetscFunctionReturn(PETSC_SUCCESS);
824 }
825 
826 /*
827   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
828 */
829 static PetscErrorCode MatInvertDiagonalForSOR_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
830 {
831   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
832   PetscInt     i, *diag, m = A->rmap->n;
833   MatScalar   *val = a->val;
834   PetscScalar *idiag, *mdiag;
835 
836   PetscFunctionBegin;
837   if (a->idiagState == ((PetscObject)A)->state && a->omega == omega && a->fshift == fshift) PetscFunctionReturn(PETSC_SUCCESS);
838   PetscCall(MatMarkDiagonal_SeqSELL(A));
839   PetscCheck(a->diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must have all diagonal location to invert them");
840   diag = a->diag;
841   if (!a->idiag) {
842     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
843     val = a->val;
844   }
845   mdiag = a->mdiag;
846   idiag = a->idiag;
847 
848   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
849     for (i = 0; i < m; i++) {
850       mdiag[i] = val[diag[i]];
851       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
852         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
853         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
854         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
855         A->factorerror_zeropivot_value = 0.0;
856         A->factorerror_zeropivot_row   = i;
857       }
858       idiag[i] = 1.0 / val[diag[i]];
859     }
860     PetscCall(PetscLogFlops(m));
861   } else {
862     for (i = 0; i < m; i++) {
863       mdiag[i] = val[diag[i]];
864       idiag[i] = omega / (fshift + val[diag[i]]);
865     }
866     PetscCall(PetscLogFlops(2.0 * m));
867   }
868   a->idiagState = ((PetscObject)A)->state;
869   a->omega      = omega;
870   a->fshift     = fshift;
871   PetscFunctionReturn(PETSC_SUCCESS);
872 }
873 
874 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
875 {
876   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
877 
878   PetscFunctionBegin;
879   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
880   PetscFunctionReturn(PETSC_SUCCESS);
881 }
882 
883 PetscErrorCode MatDestroy_SeqSELL(Mat A)
884 {
885   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
886 
887   PetscFunctionBegin;
888   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
889   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
890   PetscCall(ISDestroy(&a->row));
891   PetscCall(ISDestroy(&a->col));
892   PetscCall(PetscFree(a->diag));
893   PetscCall(PetscFree(a->rlen));
894   PetscCall(PetscFree(a->sliidx));
895   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
896   PetscCall(PetscFree(a->solve_work));
897   PetscCall(ISDestroy(&a->icol));
898   PetscCall(PetscFree(a->saved_values));
899   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
900   PetscCall(PetscFree(A->data));
901 #if defined(PETSC_HAVE_CUPM)
902   PetscCall(PetscFree(a->chunk_slice_map));
903 #endif
904 
905   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
906   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
907   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
908   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
909   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
910   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
911   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
912 #if defined(PETSC_HAVE_CUDA)
913   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
914 #endif
915 #if defined(PETSC_HAVE_HIP)
916   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL));
917 #endif
918   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
919   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
920   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
921   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
922   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
923   PetscFunctionReturn(PETSC_SUCCESS);
924 }
925 
926 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
927 {
928   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
929 
930   PetscFunctionBegin;
931   switch (op) {
932   case MAT_ROW_ORIENTED:
933     a->roworiented = flg;
934     break;
935   case MAT_KEEP_NONZERO_PATTERN:
936     a->keepnonzeropattern = flg;
937     break;
938   case MAT_NEW_NONZERO_LOCATIONS:
939     a->nonew = (flg ? 0 : 1);
940     break;
941   case MAT_NEW_NONZERO_LOCATION_ERR:
942     a->nonew = (flg ? -1 : 0);
943     break;
944   case MAT_NEW_NONZERO_ALLOCATION_ERR:
945     a->nonew = (flg ? -2 : 0);
946     break;
947   case MAT_UNUSED_NONZERO_LOCATION_ERR:
948     a->nounused = (flg ? -1 : 0);
949     break;
950   default:
951     break;
952   }
953   PetscFunctionReturn(PETSC_SUCCESS);
954 }
955 
956 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
957 {
958   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
959   PetscInt     i, j, n, shift;
960   PetscScalar *x, zero = 0.0;
961 
962   PetscFunctionBegin;
963   PetscCall(VecGetLocalSize(v, &n));
964   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
965 
966   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
967     PetscInt *diag = a->diag;
968     PetscCall(VecGetArray(v, &x));
969     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
970     PetscCall(VecRestoreArray(v, &x));
971     PetscFunctionReturn(PETSC_SUCCESS);
972   }
973 
974   PetscCall(VecSet(v, zero));
975   PetscCall(VecGetArray(v, &x));
976   for (i = 0; i < n; i++) {                                     /* loop over rows */
977     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
978     x[i]  = 0;
979     for (j = 0; j < a->rlen[i]; j++) {
980       if (a->colidx[shift + a->sliceheight * j] == i) {
981         x[i] = a->val[shift + a->sliceheight * j];
982         break;
983       }
984     }
985   }
986   PetscCall(VecRestoreArray(v, &x));
987   PetscFunctionReturn(PETSC_SUCCESS);
988 }
989 
990 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
991 {
992   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
993   const PetscScalar *l, *r;
994   PetscInt           i, j, m, n, row;
995 
996   PetscFunctionBegin;
997   if (ll) {
998     /* The local size is used so that VecMPI can be passed to this routine
999        by MatDiagonalScale_MPISELL */
1000     PetscCall(VecGetLocalSize(ll, &m));
1001     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1002     PetscCall(VecGetArrayRead(ll, &l));
1003     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1004       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1005         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1006           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1007         }
1008       } else {
1009         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) a->val[j] *= l[a->sliceheight * i + row];
1010       }
1011     }
1012     PetscCall(VecRestoreArrayRead(ll, &l));
1013     PetscCall(PetscLogFlops(a->nz));
1014   }
1015   if (rr) {
1016     PetscCall(VecGetLocalSize(rr, &n));
1017     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1018     PetscCall(VecGetArrayRead(rr, &r));
1019     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1020       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1021         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1022           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1023         }
1024       } else {
1025         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1026       }
1027     }
1028     PetscCall(VecRestoreArrayRead(rr, &r));
1029     PetscCall(PetscLogFlops(a->nz));
1030   }
1031 #if defined(PETSC_HAVE_CUPM)
1032   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1033 #endif
1034   PetscFunctionReturn(PETSC_SUCCESS);
1035 }
1036 
1037 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1038 {
1039   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1040   PetscInt    *cp, i, k, low, high, t, row, col, l;
1041   PetscInt     shift;
1042   MatScalar   *vp;
1043 
1044   PetscFunctionBegin;
1045   for (k = 0; k < m; k++) { /* loop over requested rows */
1046     row = im[k];
1047     if (row < 0) continue;
1048     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1049     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1050     cp    = a->colidx + shift;                                        /* pointer to the row */
1051     vp    = a->val + shift;                                           /* pointer to the row */
1052     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1053       col = in[l];
1054       if (col < 0) continue;
1055       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1056       high = a->rlen[row];
1057       low  = 0; /* assume unsorted */
1058       while (high - low > 5) {
1059         t = (low + high) / 2;
1060         if (*(cp + a->sliceheight * t) > col) high = t;
1061         else low = t;
1062       }
1063       for (i = low; i < high; i++) {
1064         if (*(cp + a->sliceheight * i) > col) break;
1065         if (*(cp + a->sliceheight * i) == col) {
1066           *v++ = *(vp + a->sliceheight * i);
1067           goto finished;
1068         }
1069       }
1070       *v++ = 0.0;
1071     finished:;
1072     }
1073   }
1074   PetscFunctionReturn(PETSC_SUCCESS);
1075 }
1076 
1077 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1078 {
1079   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1080   PetscInt          i, j, m = A->rmap->n, shift;
1081   const char       *name;
1082   PetscViewerFormat format;
1083 
1084   PetscFunctionBegin;
1085   PetscCall(PetscViewerGetFormat(viewer, &format));
1086   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1087     PetscInt nofinalvalue = 0;
1088     /*
1089     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) nofinalvalue = 1;
1090     */
1091     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1092     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1093     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1094 #if defined(PETSC_USE_COMPLEX)
1095     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1096 #else
1097     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1098 #endif
1099     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1100 
1101     for (i = 0; i < m; i++) {
1102       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1103       for (j = 0; j < a->rlen[i]; j++) {
1104 #if defined(PETSC_USE_COMPLEX)
1105         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1106 #else
1107         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1108 #endif
1109       }
1110     }
1111     /*
1112     if (nofinalvalue) {
1113 #if defined(PETSC_USE_COMPLEX)
1114       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1115 #else
1116       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1117 #endif
1118     }
1119     */
1120     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1121     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1122     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1123   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1124     PetscFunctionReturn(PETSC_SUCCESS);
1125   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1126     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1127     for (i = 0; i < m; i++) {
1128       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1129       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1130       for (j = 0; j < a->rlen[i]; j++) {
1131 #if defined(PETSC_USE_COMPLEX)
1132         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1133           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1134         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1135           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1136         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1137           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1138         }
1139 #else
1140         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1141 #endif
1142       }
1143       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1144     }
1145     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1146   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1147     PetscInt    cnt = 0, jcnt;
1148     PetscScalar value;
1149 #if defined(PETSC_USE_COMPLEX)
1150     PetscBool realonly = PETSC_TRUE;
1151     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1152       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1153         realonly = PETSC_FALSE;
1154         break;
1155       }
1156     }
1157 #endif
1158 
1159     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1160     for (i = 0; i < m; i++) {
1161       jcnt  = 0;
1162       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1163       for (j = 0; j < A->cmap->n; j++) {
1164         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1165           value = a->val[cnt++];
1166           jcnt++;
1167         } else {
1168           value = 0.0;
1169         }
1170 #if defined(PETSC_USE_COMPLEX)
1171         if (realonly) {
1172           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1173         } else {
1174           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1175         }
1176 #else
1177         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1178 #endif
1179       }
1180       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1181     }
1182     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1183   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1184     PetscInt fshift = 1;
1185     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1186 #if defined(PETSC_USE_COMPLEX)
1187     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1188 #else
1189     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1190 #endif
1191     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1192     for (i = 0; i < m; i++) {
1193       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1194       for (j = 0; j < a->rlen[i]; j++) {
1195 #if defined(PETSC_USE_COMPLEX)
1196         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1197 #else
1198         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1199 #endif
1200       }
1201     }
1202     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1203   } else if (format == PETSC_VIEWER_NATIVE) {
1204     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1205       PetscInt row;
1206       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1207       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1208 #if defined(PETSC_USE_COMPLEX)
1209         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1210           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1211         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1212           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1213         } else {
1214           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1215         }
1216 #else
1217         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1218 #endif
1219       }
1220     }
1221   } else {
1222     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1223     if (A->factortype) {
1224       for (i = 0; i < m; i++) {
1225         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1226         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1227         /* L part */
1228         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1229 #if defined(PETSC_USE_COMPLEX)
1230           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1231             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1232           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1233             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1234           } else {
1235             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1236           }
1237 #else
1238           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1239 #endif
1240         }
1241         /* diagonal */
1242         j = a->diag[i];
1243 #if defined(PETSC_USE_COMPLEX)
1244         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1245           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1246         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1247           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1248         } else {
1249           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1250         }
1251 #else
1252         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j])));
1253 #endif
1254 
1255         /* U part */
1256         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1257 #if defined(PETSC_USE_COMPLEX)
1258           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1259             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1260           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1261             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1262           } else {
1263             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1264           }
1265 #else
1266           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1267 #endif
1268         }
1269         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1270       }
1271     } else {
1272       for (i = 0; i < m; i++) {
1273         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1274         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1275         for (j = 0; j < a->rlen[i]; j++) {
1276 #if defined(PETSC_USE_COMPLEX)
1277           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1278             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1279           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1280             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1281           } else {
1282             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1283           }
1284 #else
1285           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1286 #endif
1287         }
1288         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1289       }
1290     }
1291     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1292   }
1293   PetscCall(PetscViewerFlush(viewer));
1294   PetscFunctionReturn(PETSC_SUCCESS);
1295 }
1296 
1297 #include <petscdraw.h>
1298 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1299 {
1300   Mat               A = (Mat)Aa;
1301   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1302   PetscInt          i, j, m = A->rmap->n, shift;
1303   int               color;
1304   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1305   PetscViewer       viewer;
1306   PetscViewerFormat format;
1307 
1308   PetscFunctionBegin;
1309   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1310   PetscCall(PetscViewerGetFormat(viewer, &format));
1311   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1312 
1313   /* loop over matrix elements drawing boxes */
1314 
1315   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1316     PetscDrawCollectiveBegin(draw);
1317     /* Blue for negative, Cyan for zero and  Red for positive */
1318     color = PETSC_DRAW_BLUE;
1319     for (i = 0; i < m; i++) {
1320       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1321       y_l   = m - i - 1.0;
1322       y_r   = y_l + 1.0;
1323       for (j = 0; j < a->rlen[i]; j++) {
1324         x_l = a->colidx[shift + a->sliceheight * j];
1325         x_r = x_l + 1.0;
1326         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1327         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1328       }
1329     }
1330     color = PETSC_DRAW_CYAN;
1331     for (i = 0; i < m; i++) {
1332       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1333       y_l   = m - i - 1.0;
1334       y_r   = y_l + 1.0;
1335       for (j = 0; j < a->rlen[i]; j++) {
1336         x_l = a->colidx[shift + a->sliceheight * j];
1337         x_r = x_l + 1.0;
1338         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1339         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1340       }
1341     }
1342     color = PETSC_DRAW_RED;
1343     for (i = 0; i < m; i++) {
1344       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1345       y_l   = m - i - 1.0;
1346       y_r   = y_l + 1.0;
1347       for (j = 0; j < a->rlen[i]; j++) {
1348         x_l = a->colidx[shift + a->sliceheight * j];
1349         x_r = x_l + 1.0;
1350         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1351         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1352       }
1353     }
1354     PetscDrawCollectiveEnd(draw);
1355   } else {
1356     /* use contour shading to indicate magnitude of values */
1357     /* first determine max of all nonzero values */
1358     PetscReal minv = 0.0, maxv = 0.0;
1359     PetscInt  count = 0;
1360     PetscDraw popup;
1361     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1362       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1363     }
1364     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1365     PetscCall(PetscDrawGetPopup(draw, &popup));
1366     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1367 
1368     PetscDrawCollectiveBegin(draw);
1369     for (i = 0; i < m; i++) {
1370       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1371       y_l   = m - i - 1.0;
1372       y_r   = y_l + 1.0;
1373       for (j = 0; j < a->rlen[i]; j++) {
1374         x_l   = a->colidx[shift + a->sliceheight * j];
1375         x_r   = x_l + 1.0;
1376         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1377         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1378         count++;
1379       }
1380     }
1381     PetscDrawCollectiveEnd(draw);
1382   }
1383   PetscFunctionReturn(PETSC_SUCCESS);
1384 }
1385 
1386 #include <petscdraw.h>
1387 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1388 {
1389   PetscDraw draw;
1390   PetscReal xr, yr, xl, yl, h, w;
1391   PetscBool isnull;
1392 
1393   PetscFunctionBegin;
1394   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1395   PetscCall(PetscDrawIsNull(draw, &isnull));
1396   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1397 
1398   xr = A->cmap->n;
1399   yr = A->rmap->n;
1400   h  = yr / 10.0;
1401   w  = xr / 10.0;
1402   xr += w;
1403   yr += h;
1404   xl = -w;
1405   yl = -h;
1406   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1407   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1408   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1409   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1410   PetscCall(PetscDrawSave(draw));
1411   PetscFunctionReturn(PETSC_SUCCESS);
1412 }
1413 
1414 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1415 {
1416   PetscBool isascii, isbinary, isdraw;
1417 
1418   PetscFunctionBegin;
1419   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii));
1420   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1421   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1422   if (isascii) {
1423     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1424   } else if (isbinary) {
1425     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1426   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1427   PetscFunctionReturn(PETSC_SUCCESS);
1428 }
1429 
1430 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1431 {
1432   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1433   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1434   MatScalar   *vp;
1435 #if defined(PETSC_HAVE_CUPM)
1436   PetscInt totalchunks = 0;
1437 #endif
1438 
1439   PetscFunctionBegin;
1440   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1441   /* To do: compress out the unused elements */
1442   PetscCall(MatMarkDiagonal_SeqSELL(A));
1443   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1444   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1445   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1446   a->nonzerorowcnt = 0;
1447   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1448   for (i = 0; i < a->totalslices; ++i) {
1449     shift = a->sliidx[i];                                                   /* starting index of the slice */
1450     cp    = PetscSafePointerPlusOffset(a->colidx, shift);                   /* pointer to the column indices of the slice */
1451     vp    = PetscSafePointerPlusOffset(a->val, shift);                      /* pointer to the nonzero values of the slice */
1452     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1453       row  = a->sliceheight * i + row_in_slice;
1454       nrow = a->rlen[row]; /* number of nonzeros in row */
1455       /*
1456         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1457         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1458       */
1459       lastcol = 0;
1460       if (nrow > 0) { /* nonempty row */
1461         a->nonzerorowcnt++;
1462         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1463       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1464         for (j = 1; j < a->sliceheight; j++) {
1465           if (a->rlen[a->sliceheight * i + j]) {
1466             lastcol = cp[j];
1467             break;
1468           }
1469         }
1470       } else {
1471         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1472       }
1473 
1474       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1475         cp[a->sliceheight * k + row_in_slice] = lastcol;
1476         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1477       }
1478     }
1479   }
1480 
1481   A->info.mallocs += a->reallocs;
1482   a->reallocs = 0;
1483 
1484 #if defined(PETSC_HAVE_CUPM)
1485   if (!a->chunksize && a->totalslices) {
1486     a->chunksize = 64;
1487     while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1488     totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1489   }
1490   if (totalchunks != a->totalchunks) {
1491     PetscCall(PetscFree(a->chunk_slice_map));
1492     PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1493     a->totalchunks = totalchunks;
1494   }
1495   j = 0;
1496   for (i = 0; i < totalchunks; i++) {
1497     while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1498     a->chunk_slice_map[i] = j;
1499   }
1500 #endif
1501   PetscFunctionReturn(PETSC_SUCCESS);
1502 }
1503 
1504 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1505 {
1506   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1507 
1508   PetscFunctionBegin;
1509   info->block_size   = 1.0;
1510   info->nz_allocated = a->maxallocmat;
1511   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1512   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1513   info->assemblies   = A->num_ass;
1514   info->mallocs      = A->info.mallocs;
1515   info->memory       = 0; /* REVIEW ME */
1516   if (A->factortype) {
1517     info->fill_ratio_given  = A->info.fill_ratio_given;
1518     info->fill_ratio_needed = A->info.fill_ratio_needed;
1519     info->factor_mallocs    = A->info.factor_mallocs;
1520   } else {
1521     info->fill_ratio_given  = 0;
1522     info->fill_ratio_needed = 0;
1523     info->factor_mallocs    = 0;
1524   }
1525   PetscFunctionReturn(PETSC_SUCCESS);
1526 }
1527 
1528 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1529 {
1530   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1531   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1532   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1533   MatScalar   *vp, value;
1534 #if defined(PETSC_HAVE_CUPM)
1535   PetscBool inserted = PETSC_FALSE;
1536   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1537 #endif
1538 
1539   PetscFunctionBegin;
1540   for (k = 0; k < m; k++) { /* loop over added rows */
1541     row = im[k];
1542     if (row < 0) continue;
1543     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1544     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1545     cp    = a->colidx + shift;                                      /* pointer to the row */
1546     vp    = a->val + shift;                                         /* pointer to the row */
1547     nrow  = a->rlen[row];
1548     low   = 0;
1549     high  = nrow;
1550 
1551     for (l = 0; l < n; l++) { /* loop over added columns */
1552       col = in[l];
1553       if (col < 0) continue;
1554       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1555       if (a->roworiented) {
1556         value = v[l + k * n];
1557       } else {
1558         value = v[k + l * m];
1559       }
1560       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1561 
1562       /* search in this row for the specified column, i indicates the column to be set */
1563       if (col <= lastcol) low = 0;
1564       else high = nrow;
1565       lastcol = col;
1566       while (high - low > 5) {
1567         t = (low + high) / 2;
1568         if (*(cp + a->sliceheight * t) > col) high = t;
1569         else low = t;
1570       }
1571       for (i = low; i < high; i++) {
1572         if (*(cp + a->sliceheight * i) > col) break;
1573         if (*(cp + a->sliceheight * i) == col) {
1574           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1575           else *(vp + a->sliceheight * i) = value;
1576 #if defined(PETSC_HAVE_CUPM)
1577           inserted = PETSC_TRUE;
1578 #endif
1579           low = i + 1;
1580           goto noinsert;
1581         }
1582       }
1583       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1584       if (nonew == 1) goto noinsert;
1585       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1586 #if defined(PETSC_HAVE_CUPM)
1587       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1588 #else
1589       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1590       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1591 #endif
1592       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1593       for (ii = nrow - 1; ii >= i; ii--) {
1594         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1595         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1596       }
1597       a->rlen[row]++;
1598       *(cp + a->sliceheight * i) = col;
1599       *(vp + a->sliceheight * i) = value;
1600       a->nz++;
1601 #if defined(PETSC_HAVE_CUPM)
1602       inserted = PETSC_TRUE;
1603 #endif
1604       low = i + 1;
1605       high++;
1606       nrow++;
1607     noinsert:;
1608     }
1609     a->rlen[row] = nrow;
1610   }
1611 #if defined(PETSC_HAVE_CUPM)
1612   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1613 #endif
1614   PetscFunctionReturn(PETSC_SUCCESS);
1615 }
1616 
1617 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1618 {
1619   PetscFunctionBegin;
1620   /* If the two matrices have the same copy implementation, use fast copy. */
1621   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1622     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1623     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1624 
1625     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1626     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1627   } else {
1628     PetscCall(MatCopy_Basic(A, B, str));
1629   }
1630   PetscFunctionReturn(PETSC_SUCCESS);
1631 }
1632 
1633 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1634 {
1635   PetscFunctionBegin;
1636   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1637   PetscFunctionReturn(PETSC_SUCCESS);
1638 }
1639 
1640 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1641 {
1642   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1643 
1644   PetscFunctionBegin;
1645   *array = a->val;
1646   PetscFunctionReturn(PETSC_SUCCESS);
1647 }
1648 
1649 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1650 {
1651   PetscFunctionBegin;
1652   PetscFunctionReturn(PETSC_SUCCESS);
1653 }
1654 
1655 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1656 {
1657   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1658   MatScalar   *aval   = a->val;
1659   PetscScalar  oalpha = alpha;
1660   PetscBLASInt one    = 1, size;
1661 
1662   PetscFunctionBegin;
1663   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1664   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1665   PetscCall(PetscLogFlops(a->nz));
1666 #if defined(PETSC_HAVE_CUPM)
1667   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1668 #endif
1669   PetscFunctionReturn(PETSC_SUCCESS);
1670 }
1671 
1672 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1673 {
1674   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1675 
1676   PetscFunctionBegin;
1677   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1678   PetscCall(MatShift_Basic(Y, a));
1679   PetscFunctionReturn(PETSC_SUCCESS);
1680 }
1681 
1682 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1683 {
1684   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1685   PetscScalar       *x, sum, *t;
1686   const MatScalar   *idiag = NULL, *mdiag;
1687   const PetscScalar *b, *xb;
1688   PetscInt           n, m = A->rmap->n, i, j, shift;
1689   const PetscInt    *diag;
1690 
1691   PetscFunctionBegin;
1692   its = its * lits;
1693 
1694   PetscCall(MatInvertDiagonalForSOR_SeqSELL(A, omega, fshift));
1695 
1696   diag  = a->diag;
1697   t     = a->ssor_work;
1698   idiag = a->idiag;
1699   mdiag = a->mdiag;
1700 
1701   PetscCall(VecGetArray(xx, &x));
1702   PetscCall(VecGetArrayRead(bb, &b));
1703   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1704   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1705   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1706   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1707 
1708   if (flag & SOR_ZERO_INITIAL_GUESS) {
1709     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1710       for (i = 0; i < m; i++) {
1711         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1712         sum   = b[i];
1713         n     = (diag[i] - shift) / a->sliceheight;
1714         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1715         t[i] = sum;
1716         x[i] = sum * idiag[i];
1717       }
1718       xb = t;
1719       PetscCall(PetscLogFlops(a->nz));
1720     } else xb = b;
1721     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1722       for (i = m - 1; i >= 0; i--) {
1723         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1724         sum   = xb[i];
1725         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1726         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1727         if (xb == b) {
1728           x[i] = sum * idiag[i];
1729         } else {
1730           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1731         }
1732       }
1733       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1734     }
1735     its--;
1736   }
1737   while (its--) {
1738     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1739       for (i = 0; i < m; i++) {
1740         /* lower */
1741         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1742         sum   = b[i];
1743         n     = (diag[i] - shift) / a->sliceheight;
1744         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1745         t[i] = sum; /* save application of the lower-triangular part */
1746         /* upper */
1747         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1748         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1749         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1750       }
1751       xb = t;
1752       PetscCall(PetscLogFlops(2.0 * a->nz));
1753     } else xb = b;
1754     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1755       for (i = m - 1; i >= 0; i--) {
1756         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1757         sum   = xb[i];
1758         if (xb == b) {
1759           /* whole matrix (no checkpointing available) */
1760           n = a->rlen[i];
1761           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1762           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1763         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1764           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1765           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1766           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1767         }
1768       }
1769       if (xb == b) {
1770         PetscCall(PetscLogFlops(2.0 * a->nz));
1771       } else {
1772         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1773       }
1774     }
1775   }
1776   PetscCall(VecRestoreArray(xx, &x));
1777   PetscCall(VecRestoreArrayRead(bb, &b));
1778   PetscFunctionReturn(PETSC_SUCCESS);
1779 }
1780 
1781 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1782                                        MatGetRow_SeqSELL,
1783                                        MatRestoreRow_SeqSELL,
1784                                        MatMult_SeqSELL,
1785                                        /* 4*/ MatMultAdd_SeqSELL,
1786                                        MatMultTranspose_SeqSELL,
1787                                        MatMultTransposeAdd_SeqSELL,
1788                                        NULL,
1789                                        NULL,
1790                                        NULL,
1791                                        /* 10*/ NULL,
1792                                        NULL,
1793                                        NULL,
1794                                        MatSOR_SeqSELL,
1795                                        NULL,
1796                                        /* 15*/ MatGetInfo_SeqSELL,
1797                                        MatEqual_SeqSELL,
1798                                        MatGetDiagonal_SeqSELL,
1799                                        MatDiagonalScale_SeqSELL,
1800                                        NULL,
1801                                        /* 20*/ NULL,
1802                                        MatAssemblyEnd_SeqSELL,
1803                                        MatSetOption_SeqSELL,
1804                                        MatZeroEntries_SeqSELL,
1805                                        /* 24*/ NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        NULL,
1809                                        NULL,
1810                                        /* 29*/ MatSetUp_SeqSELL,
1811                                        NULL,
1812                                        NULL,
1813                                        NULL,
1814                                        NULL,
1815                                        /* 34*/ MatDuplicate_SeqSELL,
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        NULL,
1820                                        /* 39*/ NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        MatGetValues_SeqSELL,
1824                                        MatCopy_SeqSELL,
1825                                        /* 44*/ NULL,
1826                                        MatScale_SeqSELL,
1827                                        MatShift_SeqSELL,
1828                                        NULL,
1829                                        NULL,
1830                                        /* 49*/ NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        NULL,
1834                                        NULL,
1835                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1836                                        NULL,
1837                                        NULL,
1838                                        NULL,
1839                                        NULL,
1840                                        /* 59*/ NULL,
1841                                        MatDestroy_SeqSELL,
1842                                        MatView_SeqSELL,
1843                                        NULL,
1844                                        NULL,
1845                                        /* 64*/ NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        NULL,
1850                                        /* 69*/ NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1854                                        NULL,
1855                                        /* 74*/ NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        NULL,
1860                                        /* 79*/ NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        NULL,
1864                                        NULL,
1865                                        /* 84*/ NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        NULL,
1869                                        NULL,
1870                                        /* 89*/ NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        MatConjugate_SeqSELL,
1875                                        /* 94*/ NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        NULL,
1879                                        NULL,
1880                                        /* 99*/ NULL,
1881                                        NULL,
1882                                        NULL,
1883                                        NULL,
1884                                        NULL,
1885                                        /*104*/ MatMissingDiagonal_SeqSELL,
1886                                        NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        NULL,
1890                                        /*109*/ NULL,
1891                                        NULL,
1892                                        NULL,
1893                                        NULL,
1894                                        NULL,
1895                                        /*114*/ NULL,
1896                                        NULL,
1897                                        NULL,
1898                                        NULL,
1899                                        NULL,
1900                                        /*119*/ NULL,
1901                                        NULL,
1902                                        NULL,
1903                                        NULL,
1904                                        NULL,
1905                                        /*124*/ NULL,
1906                                        NULL,
1907                                        NULL,
1908                                        NULL,
1909                                        NULL,
1910                                        /*129*/ MatFDColoringSetUp_SeqXAIJ,
1911                                        NULL,
1912                                        NULL,
1913                                        NULL,
1914                                        NULL,
1915                                        /*134*/ NULL,
1916                                        NULL,
1917                                        NULL,
1918                                        NULL,
1919                                        NULL,
1920                                        /*139*/ NULL,
1921                                        NULL,
1922                                        NULL,
1923                                        NULL,
1924                                        NULL,
1925                                        NULL};
1926 
1927 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1928 {
1929   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1930 
1931   PetscFunctionBegin;
1932   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1933 
1934   /* allocate space for values if not already there */
1935   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1936 
1937   /* copy values over */
1938   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1939   PetscFunctionReturn(PETSC_SUCCESS);
1940 }
1941 
1942 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1943 {
1944   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1945 
1946   PetscFunctionBegin;
1947   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1948   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1949   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1950   PetscFunctionReturn(PETSC_SUCCESS);
1951 }
1952 
1953 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1954 {
1955   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1956 
1957   PetscFunctionBegin;
1958   if (a->totalslices && a->sliidx[a->totalslices]) {
1959     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1960   } else {
1961     *ratio = 0.0;
1962   }
1963   PetscFunctionReturn(PETSC_SUCCESS);
1964 }
1965 
1966 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1967 {
1968   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1969   PetscInt     i, current_slicewidth;
1970 
1971   PetscFunctionBegin;
1972   *slicewidth = 0;
1973   for (i = 0; i < a->totalslices; i++) {
1974     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
1975     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
1976   }
1977   PetscFunctionReturn(PETSC_SUCCESS);
1978 }
1979 
1980 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
1981 {
1982   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1983 
1984   PetscFunctionBegin;
1985   *slicewidth = 0;
1986   if (a->totalslices) *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices;
1987   PetscFunctionReturn(PETSC_SUCCESS);
1988 }
1989 
1990 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
1991 {
1992   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1993   PetscReal    mean;
1994   PetscInt     i, totalslices = a->totalslices, *sliidx = a->sliidx;
1995 
1996   PetscFunctionBegin;
1997   *variance = 0;
1998   if (totalslices) {
1999     mean = (PetscReal)sliidx[totalslices] / totalslices;
2000     for (i = 1; i <= totalslices; i++) *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices;
2001   }
2002   PetscFunctionReturn(PETSC_SUCCESS);
2003 }
2004 
2005 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2006 {
2007   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2008 
2009   PetscFunctionBegin;
2010   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2011   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2012   a->sliceheight = sliceheight;
2013 #if defined(PETSC_HAVE_CUPM)
2014   PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight);
2015 #endif
2016   PetscFunctionReturn(PETSC_SUCCESS);
2017 }
2018 
2019 /*@
2020   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2021 
2022   Not Collective
2023 
2024   Input Parameter:
2025 . A - a MATSEQSELL matrix
2026 
2027   Output Parameter:
2028 . ratio - ratio of number of padded zeros to number of allocated elements
2029 
2030   Level: intermediate
2031 
2032 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2033 @*/
2034 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2035 {
2036   PetscFunctionBegin;
2037   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2038   PetscFunctionReturn(PETSC_SUCCESS);
2039 }
2040 
2041 /*@
2042   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2043 
2044   Not Collective
2045 
2046   Input Parameter:
2047 . A - a MATSEQSELL matrix
2048 
2049   Output Parameter:
2050 . slicewidth - maximum slice width
2051 
2052   Level: intermediate
2053 
2054 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2055 @*/
2056 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2057 {
2058   PetscFunctionBegin;
2059   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2060   PetscFunctionReturn(PETSC_SUCCESS);
2061 }
2062 
2063 /*@
2064   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2065 
2066   Not Collective
2067 
2068   Input Parameter:
2069 . A - a MATSEQSELL matrix
2070 
2071   Output Parameter:
2072 . slicewidth - average slice width
2073 
2074   Level: intermediate
2075 
2076 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
2077 @*/
2078 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2079 {
2080   PetscFunctionBegin;
2081   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2082   PetscFunctionReturn(PETSC_SUCCESS);
2083 }
2084 
2085 /*@
2086   MatSeqSELLSetSliceHeight - sets the slice height.
2087 
2088   Not Collective
2089 
2090   Input Parameters:
2091 + A           - a MATSEQSELL matrix
2092 - sliceheight - slice height
2093 
2094   Notes:
2095   You cannot change the slice height once it have been set.
2096 
2097   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2098 
2099   Level: intermediate
2100 
2101 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
2102 @*/
2103 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2104 {
2105   PetscFunctionBegin;
2106   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2107   PetscFunctionReturn(PETSC_SUCCESS);
2108 }
2109 
2110 /*@
2111   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2112 
2113   Not Collective
2114 
2115   Input Parameter:
2116 . A - a MATSEQSELL matrix
2117 
2118   Output Parameter:
2119 . variance - variance of the slice size
2120 
2121   Level: intermediate
2122 
2123 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
2124 @*/
2125 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2126 {
2127   PetscFunctionBegin;
2128   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2129   PetscFunctionReturn(PETSC_SUCCESS);
2130 }
2131 
2132 #if defined(PETSC_HAVE_CUDA)
2133 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2134 #endif
2135 #if defined(PETSC_HAVE_HIP)
2136 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat);
2137 #endif
2138 
2139 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2140 {
2141   Mat_SeqSELL *b;
2142   PetscMPIInt  size;
2143 
2144   PetscFunctionBegin;
2145   PetscCall(PetscCitationsRegister(citation, &cited));
2146   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2147   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2148 
2149   PetscCall(PetscNew(&b));
2150 
2151   B->data   = (void *)b;
2152   B->ops[0] = MatOps_Values;
2153 
2154   b->row                = NULL;
2155   b->col                = NULL;
2156   b->icol               = NULL;
2157   b->reallocs           = 0;
2158   b->ignorezeroentries  = PETSC_FALSE;
2159   b->roworiented        = PETSC_TRUE;
2160   b->nonew              = 0;
2161   b->diag               = NULL;
2162   b->solve_work         = NULL;
2163   B->spptr              = NULL;
2164   b->saved_values       = NULL;
2165   b->idiag              = NULL;
2166   b->mdiag              = NULL;
2167   b->ssor_work          = NULL;
2168   b->omega              = 1.0;
2169   b->fshift             = 0.0;
2170   b->keepnonzeropattern = PETSC_FALSE;
2171   b->sliceheight        = 0;
2172 
2173   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2174   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2175   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2176   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2177   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2178   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2179   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2180 #if defined(PETSC_HAVE_CUDA)
2181   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2182 #endif
2183 #if defined(PETSC_HAVE_HIP)
2184   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP));
2185 #endif
2186   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2187   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2188   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2189   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2190   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2191 
2192   PetscObjectOptionsBegin((PetscObject)B);
2193   {
2194     PetscInt  newsh = -1;
2195     PetscBool flg;
2196 #if defined(PETSC_HAVE_CUPM)
2197     PetscInt chunksize = 0;
2198 #endif
2199 
2200     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2201     if (flg) PetscCall(MatSeqSELLSetSliceHeight(B, newsh));
2202 #if defined(PETSC_HAVE_CUPM)
2203     PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2204     if (flg) {
2205       PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2206       b->chunksize = chunksize;
2207     }
2208 #endif
2209   }
2210   PetscOptionsEnd();
2211   PetscFunctionReturn(PETSC_SUCCESS);
2212 }
2213 
2214 /*
2215  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2216  */
2217 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2218 {
2219   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2220   PetscInt     i, m                           = A->rmap->n;
2221   PetscInt     totalslices = a->totalslices;
2222 
2223   PetscFunctionBegin;
2224   C->factortype = A->factortype;
2225   c->row        = NULL;
2226   c->col        = NULL;
2227   c->icol       = NULL;
2228   c->reallocs   = 0;
2229   C->assembled  = PETSC_TRUE;
2230 
2231   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2232   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2233 
2234   c->sliceheight = a->sliceheight;
2235   PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen));
2236   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2237 
2238   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2239   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2240 
2241   /* allocate the matrix space */
2242   if (mallocmatspace) {
2243     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2244 
2245     c->singlemalloc = PETSC_TRUE;
2246 
2247     if (m > 0) {
2248       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2249       if (cpvalues == MAT_COPY_VALUES) {
2250         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2251       } else {
2252         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2253       }
2254     }
2255   }
2256 
2257   c->ignorezeroentries = a->ignorezeroentries;
2258   c->roworiented       = a->roworiented;
2259   c->nonew             = a->nonew;
2260   if (a->diag) {
2261     PetscCall(PetscMalloc1(m, &c->diag));
2262     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2263   } else c->diag = NULL;
2264 
2265   c->solve_work         = NULL;
2266   c->saved_values       = NULL;
2267   c->idiag              = NULL;
2268   c->ssor_work          = NULL;
2269   c->keepnonzeropattern = a->keepnonzeropattern;
2270   c->free_val           = PETSC_TRUE;
2271   c->free_colidx        = PETSC_TRUE;
2272 
2273   c->maxallocmat  = a->maxallocmat;
2274   c->maxallocrow  = a->maxallocrow;
2275   c->rlenmax      = a->rlenmax;
2276   c->nz           = a->nz;
2277   C->preallocated = PETSC_TRUE;
2278 
2279   c->nonzerorowcnt = a->nonzerorowcnt;
2280   C->nonzerostate  = A->nonzerostate;
2281 
2282   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2283   PetscFunctionReturn(PETSC_SUCCESS);
2284 }
2285 
2286 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2287 {
2288   PetscFunctionBegin;
2289   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2290   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2291   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2292   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2293   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2294   PetscFunctionReturn(PETSC_SUCCESS);
2295 }
2296 
2297 /*MC
2298    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2299    based on the sliced Ellpack format, {cite}`zhangellpack2018`
2300 
2301    Options Database Key:
2302 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2303 
2304    Level: beginner
2305 
2306 .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2307 M*/
2308 
2309 /*MC
2310    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018`
2311 
2312    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2313    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2314   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2315   for communicators controlling multiple processes.  It is recommended that you call both of
2316   the above preallocation routines for simplicity.
2317 
2318    Options Database Key:
2319 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2320 
2321   Level: beginner
2322 
2323   Notes:
2324   This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2325 
2326   It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2327   non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2328 
2329   Developer Notes:
2330   On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2331 
2332   The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2333 .vb
2334                             (2 0  3 4)
2335    Consider the matrix A =  (5 0  6 0)
2336                             (0 0  7 8)
2337                             (0 0  9 9)
2338 
2339    symbolically the Ellpack format can be written as
2340 
2341         (2 3 4 |)           (0 2 3 |)
2342    v =  (5 6 0 |)  colidx = (0 2 2 |)
2343         --------            ---------
2344         (7 8 |)             (2 3 |)
2345         (9 9 |)             (2 3 |)
2346 
2347     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2348     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2349     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2350 
2351     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2352 
2353 .ve
2354 
2355     See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance.
2356 
2357 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2358 M*/
2359 
2360 /*@
2361   MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2362 
2363   Collective
2364 
2365   Input Parameters:
2366 + comm    - MPI communicator, set to `PETSC_COMM_SELF`
2367 . m       - number of rows
2368 . n       - number of columns
2369 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2370 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2371 
2372   Output Parameter:
2373 . A - the matrix
2374 
2375   Level: intermediate
2376 
2377   Notes:
2378   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2379   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2380   [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2381 
2382   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2383   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2384   allocation.
2385 
2386 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
2387  @*/
2388 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2389 {
2390   PetscFunctionBegin;
2391   PetscCall(MatCreate(comm, A));
2392   PetscCall(MatSetSizes(*A, m, n, m, n));
2393   PetscCall(MatSetType(*A, MATSEQSELL));
2394   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2395   PetscFunctionReturn(PETSC_SUCCESS);
2396 }
2397 
2398 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2399 {
2400   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2401   PetscInt     totalslices = a->totalslices;
2402 
2403   PetscFunctionBegin;
2404   /* If the  matrix dimensions are not equal,or no of nonzeros */
2405   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2406     *flg = PETSC_FALSE;
2407     PetscFunctionReturn(PETSC_SUCCESS);
2408   }
2409   /* if the a->colidx are the same */
2410   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2411   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2412   /* if a->val are the same */
2413   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2414   PetscFunctionReturn(PETSC_SUCCESS);
2415 }
2416 
2417 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2418 {
2419 #if defined(PETSC_USE_COMPLEX)
2420   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2421   PetscInt     i;
2422   PetscScalar *val = a->val;
2423 
2424   PetscFunctionBegin;
2425   for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2426   #if defined(PETSC_HAVE_CUPM)
2427   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2428   #endif
2429 #else
2430   PetscFunctionBegin;
2431 #endif
2432   PetscFunctionReturn(PETSC_SUCCESS);
2433 }
2434