xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 90d2215b4f5b865a0e7953289703ce9072a42621)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter `nz`
48  (or the array `nnz`).
49 
50  Collective
51 
52  Input Parameters:
53 +  B - The `MATSEQSELL` matrix
54 .  rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
55 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
56 
57  Level: intermediate
58 
59  Notes:
60  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
61  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
62  allocation.
63 
64  You can call `MatGetInfo()` to get information on how effective the preallocation was;
65  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
66  You can also run with the option `-info` and look for messages with the string
67  malloc in them to see if additional memory allocation was needed.
68 
69  Developer Note:
70  Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
71  entries or columns indices.
72 
73  The maximum number of nonzeos in any row should be as accurate as possible.
74  If it is underestimated, you will get bad performance due to reallocation
75  (`MatSeqXSELLReallocateSELL()`).
76 
77  .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
78  @*/
79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
80 {
81   PetscFunctionBegin;
82   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
83   PetscValidType(B, 1);
84   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
89 {
90   Mat_SeqSELL *b;
91   PetscInt     i, j, totalslices;
92 #if defined(PETSC_HAVE_CUDA)
93   PetscInt rlenmax = 0;
94 #endif
95   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
96 
97   PetscFunctionBegin;
98   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
99   if (maxallocrow == MAT_SKIP_ALLOCATION) {
100     skipallocation = PETSC_TRUE;
101     maxallocrow    = 0;
102   }
103 
104   PetscCall(PetscLayoutSetUp(B->rmap));
105   PetscCall(PetscLayoutSetUp(B->cmap));
106 
107   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
108   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
109   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
110   if (rlen) {
111     for (i = 0; i < B->rmap->n; i++) {
112       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
113       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
114     }
115   }
116 
117   B->preallocated = PETSC_TRUE;
118 
119   b = (Mat_SeqSELL *)B->data;
120 
121   if (!b->sliceheight) { /* not set yet */
122 #if defined(PETSC_HAVE_CUDA)
123     b->sliceheight = 16;
124 #else
125     b->sliceheight = 8;
126 #endif
127   }
128   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
129   b->totalslices = totalslices;
130   if (!skipallocation) {
131     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
132 
133     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
134       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
135     }
136     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
137       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
138       else if (maxallocrow < 0) maxallocrow = 1;
139 #if defined(PETSC_HAVE_CUDA)
140       rlenmax = maxallocrow;
141       /* Pad the slice to DEVICE_MEM_ALIGN */
142       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
143 #endif
144       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
145     } else {
146 #if defined(PETSC_HAVE_CUDA)
147       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
148 #endif
149       maxallocrow  = 0;
150       b->sliidx[0] = 0;
151       for (i = 1; i < totalslices; i++) {
152         b->sliidx[i] = 0;
153         for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
154 #if defined(PETSC_HAVE_CUDA)
155         rlenmax = PetscMax(b->sliidx[i], rlenmax);
156         /* Pad the slice to DEVICE_MEM_ALIGN */
157         b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
158 #endif
159         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161       }
162       /* last slice */
163       b->sliidx[totalslices] = 0;
164       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165 #if defined(PETSC_HAVE_CUDA)
166       rlenmax                = PetscMax(b->sliidx[i], rlenmax);
167       b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
168 #endif
169       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
170       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
171     }
172 
173     /* allocate space for val, colidx, rlen */
174     /* FIXME: should B's old memory be unlogged? */
175     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
176     /* FIXME: assuming an element of the bit array takes 8 bits */
177     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
178     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
179     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
180 
181     b->singlemalloc = PETSC_TRUE;
182     b->free_val     = PETSC_TRUE;
183     b->free_colidx  = PETSC_TRUE;
184   } else {
185     b->free_val    = PETSC_FALSE;
186     b->free_colidx = PETSC_FALSE;
187   }
188 
189   b->nz          = 0;
190   b->maxallocrow = maxallocrow;
191 #if defined(PETSC_HAVE_CUDA)
192   b->rlenmax = rlenmax;
193 #else
194   b->rlenmax = maxallocrow;
195 #endif
196   b->maxallocmat      = b->sliidx[totalslices];
197   B->info.nz_unneeded = (double)b->maxallocmat;
198   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
203 {
204   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
205   PetscInt     shift;
206 
207   PetscFunctionBegin;
208   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
209   if (nz) *nz = a->rlen[row];
210   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
211   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
212   if (idx) {
213     PetscInt j;
214     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
215     *idx = a->getrowcols;
216   }
217   if (v) {
218     PetscInt j;
219     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
220     *v = a->getrowvals;
221   }
222   PetscFunctionReturn(PETSC_SUCCESS);
223 }
224 
225 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
226 {
227   PetscFunctionBegin;
228   PetscFunctionReturn(PETSC_SUCCESS);
229 }
230 
231 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
232 {
233   Mat          B;
234   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
235   PetscInt     i;
236 
237   PetscFunctionBegin;
238   if (reuse == MAT_REUSE_MATRIX) {
239     B = *newmat;
240     PetscCall(MatZeroEntries(B));
241   } else {
242     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
243     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
244     PetscCall(MatSetType(B, MATSEQAIJ));
245     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
246   }
247 
248   for (i = 0; i < A->rmap->n; i++) {
249     PetscInt     nz = 0, *cols = NULL;
250     PetscScalar *vals = NULL;
251 
252     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
253     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
254     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
255   }
256 
257   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
258   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
259   B->rmap->bs = A->rmap->bs;
260 
261   if (reuse == MAT_INPLACE_MATRIX) {
262     PetscCall(MatHeaderReplace(A, &B));
263   } else {
264     *newmat = B;
265   }
266   PetscFunctionReturn(PETSC_SUCCESS);
267 }
268 
269 #include <../src/mat/impls/aij/seq/aij.h>
270 
271 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
272 {
273   Mat                B;
274   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
275   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
276   const PetscInt    *cols;
277   const PetscScalar *vals;
278 
279   PetscFunctionBegin;
280 
281   if (reuse == MAT_REUSE_MATRIX) {
282     B = *newmat;
283   } else {
284     if (PetscDefined(USE_DEBUG) || !a->ilen) {
285       PetscCall(PetscMalloc1(m, &rowlengths));
286       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
287     }
288     if (PetscDefined(USE_DEBUG) && a->ilen) {
289       PetscBool eq;
290       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
291       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
292       PetscCall(PetscFree(rowlengths));
293       rowlengths = a->ilen;
294     } else if (a->ilen) rowlengths = a->ilen;
295     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
296     PetscCall(MatSetSizes(B, m, n, m, n));
297     PetscCall(MatSetType(B, MATSEQSELL));
298     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
299     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
300   }
301 
302   for (row = 0; row < m; row++) {
303     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
304     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
305     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
306   }
307   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
308   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
309   B->rmap->bs = A->rmap->bs;
310 
311   if (reuse == MAT_INPLACE_MATRIX) {
312     PetscCall(MatHeaderReplace(A, &B));
313   } else {
314     *newmat = B;
315   }
316   PetscFunctionReturn(PETSC_SUCCESS);
317 }
318 
319 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
320 {
321   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
322   PetscScalar       *y;
323   const PetscScalar *x;
324   const MatScalar   *aval        = a->val;
325   PetscInt           totalslices = a->totalslices;
326   const PetscInt    *acolidx     = a->colidx;
327   PetscInt           i, j;
328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
329   __m512d  vec_x, vec_y, vec_vals;
330   __m256i  vec_idx;
331   __mmask8 mask;
332   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
333   __m256i  vec_idx2, vec_idx3, vec_idx4;
334 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
335   __m128i   vec_idx;
336   __m256d   vec_x, vec_y, vec_y2, vec_vals;
337   MatScalar yval;
338   PetscInt  r, rows_left, row, nnz_in_row;
339 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
340   __m128d   vec_x_tmp;
341   __m256d   vec_x, vec_y, vec_y2, vec_vals;
342   MatScalar yval;
343   PetscInt  r, rows_left, row, nnz_in_row;
344 #else
345   PetscInt     k, sliceheight = a->sliceheight;
346   PetscScalar *sum;
347 #endif
348 
349 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
350   #pragma disjoint(*x, *y, *aval)
351 #endif
352 
353   PetscFunctionBegin;
354   PetscCall(VecGetArrayRead(xx, &x));
355   PetscCall(VecGetArray(yy, &y));
356 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
357   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
358   for (i = 0; i < totalslices; i++) { /* loop over slices */
359     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
360     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361 
362     vec_y  = _mm512_setzero_pd();
363     vec_y2 = _mm512_setzero_pd();
364     vec_y3 = _mm512_setzero_pd();
365     vec_y4 = _mm512_setzero_pd();
366 
367     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
368     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
369     case 3:
370       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
371       acolidx += 8;
372       aval += 8;
373       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
374       acolidx += 8;
375       aval += 8;
376       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
377       acolidx += 8;
378       aval += 8;
379       j += 3;
380       break;
381     case 2:
382       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
383       acolidx += 8;
384       aval += 8;
385       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
386       acolidx += 8;
387       aval += 8;
388       j += 2;
389       break;
390     case 1:
391       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
392       acolidx += 8;
393       aval += 8;
394       j += 1;
395       break;
396     }
397   #pragma novector
398     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
399       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
400       acolidx += 8;
401       aval += 8;
402       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
403       acolidx += 8;
404       aval += 8;
405       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
406       acolidx += 8;
407       aval += 8;
408       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
409       acolidx += 8;
410       aval += 8;
411     }
412 
413     vec_y = _mm512_add_pd(vec_y, vec_y2);
414     vec_y = _mm512_add_pd(vec_y, vec_y3);
415     vec_y = _mm512_add_pd(vec_y, vec_y4);
416     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
417       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
418       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
419     } else {
420       _mm512_storeu_pd(&y[8 * i], vec_y);
421     }
422   }
423 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
424   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
425   for (i = 0; i < totalslices; i++) { /* loop over full slices */
426     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
427     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428 
429     /* last slice may have padding rows. Don't use vectorization. */
430     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
431       rows_left = A->rmap->n - 8 * i;
432       for (r = 0; r < rows_left; ++r) {
433         yval       = (MatScalar)0;
434         row        = 8 * i + r;
435         nnz_in_row = a->rlen[row];
436         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
437         y[row] = yval;
438       }
439       break;
440     }
441 
442     vec_y  = _mm256_setzero_pd();
443     vec_y2 = _mm256_setzero_pd();
444 
445   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
446   #pragma novector
447   #pragma unroll(2)
448     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
449       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
450       aval += 4;
451       acolidx += 4;
452       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
453       aval += 4;
454       acolidx += 4;
455     }
456 
457     _mm256_storeu_pd(y + i * 8, vec_y);
458     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
459   }
460 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
461   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
462   for (i = 0; i < totalslices; i++) { /* loop over full slices */
463     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
464     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465 
466     vec_y  = _mm256_setzero_pd();
467     vec_y2 = _mm256_setzero_pd();
468 
469     /* last slice may have padding rows. Don't use vectorization. */
470     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
471       rows_left = A->rmap->n - 8 * i;
472       for (r = 0; r < rows_left; ++r) {
473         yval       = (MatScalar)0;
474         row        = 8 * i + r;
475         nnz_in_row = a->rlen[row];
476         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
477         y[row] = yval;
478       }
479       break;
480     }
481 
482   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
483   #pragma novector
484   #pragma unroll(2)
485     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
486       vec_vals  = _mm256_loadu_pd(aval);
487       vec_x_tmp = _mm_setzero_pd();
488       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
489       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
490       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
491       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
492       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
493       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
494       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
495       aval += 4;
496 
497       vec_vals  = _mm256_loadu_pd(aval);
498       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
499       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
500       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
501       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
502       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
503       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
504       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
505       aval += 4;
506     }
507 
508     _mm256_storeu_pd(y + i * 8, vec_y);
509     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
510   }
511 #else
512   PetscCall(PetscMalloc1(sliceheight, &sum));
513   for (i = 0; i < totalslices; i++) { /* loop over slices */
514     for (j = 0; j < sliceheight; j++) {
515       sum[j] = 0.0;
516       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
517     }
518     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
519       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
520     } else {
521       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
522     }
523   }
524   PetscCall(PetscFree(sum));
525 #endif
526 
527   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
528   PetscCall(VecRestoreArrayRead(xx, &x));
529   PetscCall(VecRestoreArray(yy, &y));
530   PetscFunctionReturn(PETSC_SUCCESS);
531 }
532 
533 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
534 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
535 {
536   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
537   PetscScalar       *y, *z;
538   const PetscScalar *x;
539   const MatScalar   *aval        = a->val;
540   PetscInt           totalslices = a->totalslices;
541   const PetscInt    *acolidx     = a->colidx;
542   PetscInt           i, j;
543 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
544   __m512d  vec_x, vec_y, vec_vals;
545   __m256i  vec_idx;
546   __mmask8 mask;
547   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
548   __m256i  vec_idx2, vec_idx3, vec_idx4;
549 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
550   __m128d   vec_x_tmp;
551   __m256d   vec_x, vec_y, vec_y2, vec_vals;
552   MatScalar yval;
553   PetscInt  r, row, nnz_in_row;
554 #else
555   PetscInt     k, sliceheight = a->sliceheight;
556   PetscScalar *sum;
557 #endif
558 
559 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
560   #pragma disjoint(*x, *y, *aval)
561 #endif
562 
563   PetscFunctionBegin;
564   if (!a->nz) {
565     PetscCall(VecCopy(yy, zz));
566     PetscFunctionReturn(PETSC_SUCCESS);
567   }
568   PetscCall(VecGetArrayRead(xx, &x));
569   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
570 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
571   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
572   for (i = 0; i < totalslices; i++) { /* loop over slices */
573     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
574     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575 
576     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
577       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
578       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
579     } else {
580       vec_y = _mm512_loadu_pd(&y[8 * i]);
581     }
582     vec_y2 = _mm512_setzero_pd();
583     vec_y3 = _mm512_setzero_pd();
584     vec_y4 = _mm512_setzero_pd();
585 
586     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
587     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
588     case 3:
589       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
590       acolidx += 8;
591       aval += 8;
592       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
593       acolidx += 8;
594       aval += 8;
595       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
596       acolidx += 8;
597       aval += 8;
598       j += 3;
599       break;
600     case 2:
601       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
602       acolidx += 8;
603       aval += 8;
604       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
605       acolidx += 8;
606       aval += 8;
607       j += 2;
608       break;
609     case 1:
610       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
611       acolidx += 8;
612       aval += 8;
613       j += 1;
614       break;
615     }
616   #pragma novector
617     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
618       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
619       acolidx += 8;
620       aval += 8;
621       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
622       acolidx += 8;
623       aval += 8;
624       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
625       acolidx += 8;
626       aval += 8;
627       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
628       acolidx += 8;
629       aval += 8;
630     }
631 
632     vec_y = _mm512_add_pd(vec_y, vec_y2);
633     vec_y = _mm512_add_pd(vec_y, vec_y3);
634     vec_y = _mm512_add_pd(vec_y, vec_y4);
635     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
636       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
637     } else {
638       _mm512_storeu_pd(&z[8 * i], vec_y);
639     }
640   }
641 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
642   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
643   for (i = 0; i < totalslices; i++) { /* loop over full slices */
644     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
645     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646 
647     /* last slice may have padding rows. Don't use vectorization. */
648     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
649       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
650         row        = 8 * i + r;
651         yval       = (MatScalar)0.0;
652         nnz_in_row = a->rlen[row];
653         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
654         z[row] = y[row] + yval;
655       }
656       break;
657     }
658 
659     vec_y  = _mm256_loadu_pd(y + 8 * i);
660     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
661 
662     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
663     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
664       vec_vals  = _mm256_loadu_pd(aval);
665       vec_x_tmp = _mm_setzero_pd();
666       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
667       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
668       vec_x     = _mm256_setzero_pd();
669       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
670       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
671       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
672       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
673       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
674       aval += 4;
675 
676       vec_vals  = _mm256_loadu_pd(aval);
677       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
678       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
679       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
680       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
681       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
682       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
683       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
684       aval += 4;
685     }
686 
687     _mm256_storeu_pd(z + i * 8, vec_y);
688     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
689   }
690 #else
691   PetscCall(PetscMalloc1(sliceheight, &sum));
692   for (i = 0; i < totalslices; i++) { /* loop over slices */
693     for (j = 0; j < sliceheight; j++) {
694       sum[j] = 0.0;
695       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
696     }
697     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
698       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
699     } else {
700       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
701     }
702   }
703   PetscCall(PetscFree(sum));
704 #endif
705 
706   PetscCall(PetscLogFlops(2.0 * a->nz));
707   PetscCall(VecRestoreArrayRead(xx, &x));
708   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
709   PetscFunctionReturn(PETSC_SUCCESS);
710 }
711 
712 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
713 {
714   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
715   PetscScalar       *y;
716   const PetscScalar *x;
717   const MatScalar   *aval    = a->val;
718   const PetscInt    *acolidx = a->colidx;
719   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
720 
721 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
722   #pragma disjoint(*x, *y, *aval)
723 #endif
724 
725   PetscFunctionBegin;
726   if (A->symmetric == PETSC_BOOL3_TRUE) {
727     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
728     PetscFunctionReturn(PETSC_SUCCESS);
729   }
730   if (zz != yy) PetscCall(VecCopy(zz, yy));
731 
732   if (a->nz) {
733     PetscCall(VecGetArrayRead(xx, &x));
734     PetscCall(VecGetArray(yy, &y));
735     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
736       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
737         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
738           row        = sliceheight * i + r;
739           nnz_in_row = a->rlen[row];
740           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
741         }
742         break;
743       }
744       for (r = 0; r < sliceheight; ++r)
745         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
746     }
747     PetscCall(PetscLogFlops(2.0 * a->nz));
748     PetscCall(VecRestoreArrayRead(xx, &x));
749     PetscCall(VecRestoreArray(yy, &y));
750   }
751   PetscFunctionReturn(PETSC_SUCCESS);
752 }
753 
754 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
755 {
756   PetscFunctionBegin;
757   if (A->symmetric == PETSC_BOOL3_TRUE) {
758     PetscCall(MatMult_SeqSELL(A, xx, yy));
759   } else {
760     PetscCall(VecSet(yy, 0.0));
761     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
762   }
763   PetscFunctionReturn(PETSC_SUCCESS);
764 }
765 
766 /*
767      Checks for missing diagonals
768 */
769 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
770 {
771   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
772   PetscInt    *diag, i;
773 
774   PetscFunctionBegin;
775   *missing = PETSC_FALSE;
776   if (A->rmap->n > 0 && !(a->colidx)) {
777     *missing = PETSC_TRUE;
778     if (d) *d = 0;
779     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
780   } else {
781     diag = a->diag;
782     for (i = 0; i < A->rmap->n; i++) {
783       if (diag[i] == -1) {
784         *missing = PETSC_TRUE;
785         if (d) *d = i;
786         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
787         break;
788       }
789     }
790   }
791   PetscFunctionReturn(PETSC_SUCCESS);
792 }
793 
794 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
795 {
796   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
797   PetscInt     i, j, m = A->rmap->n, shift;
798 
799   PetscFunctionBegin;
800   if (!a->diag) {
801     PetscCall(PetscMalloc1(m, &a->diag));
802     a->free_diag = PETSC_TRUE;
803   }
804   for (i = 0; i < m; i++) {                                          /* loop over rows */
805     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
806     a->diag[i] = -1;
807     for (j = 0; j < a->rlen[i]; j++) {
808       if (a->colidx[shift + a->sliceheight * j] == i) {
809         a->diag[i] = shift + a->sliceheight * j;
810         break;
811       }
812     }
813   }
814   PetscFunctionReturn(PETSC_SUCCESS);
815 }
816 
817 /*
818   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
819 */
820 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
821 {
822   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
823   PetscInt     i, *diag, m = A->rmap->n;
824   MatScalar   *val = a->val;
825   PetscScalar *idiag, *mdiag;
826 
827   PetscFunctionBegin;
828   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
829   PetscCall(MatMarkDiagonal_SeqSELL(A));
830   diag = a->diag;
831   if (!a->idiag) {
832     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
833     val = a->val;
834   }
835   mdiag = a->mdiag;
836   idiag = a->idiag;
837 
838   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
839     for (i = 0; i < m; i++) {
840       mdiag[i] = val[diag[i]];
841       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
842         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
843         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
844         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
845         A->factorerror_zeropivot_value = 0.0;
846         A->factorerror_zeropivot_row   = i;
847       }
848       idiag[i] = 1.0 / val[diag[i]];
849     }
850     PetscCall(PetscLogFlops(m));
851   } else {
852     for (i = 0; i < m; i++) {
853       mdiag[i] = val[diag[i]];
854       idiag[i] = omega / (fshift + val[diag[i]]);
855     }
856     PetscCall(PetscLogFlops(2.0 * m));
857   }
858   a->idiagvalid = PETSC_TRUE;
859   PetscFunctionReturn(PETSC_SUCCESS);
860 }
861 
862 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
863 {
864   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
865 
866   PetscFunctionBegin;
867   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
868   PetscCall(MatSeqSELLInvalidateDiagonal(A));
869   PetscFunctionReturn(PETSC_SUCCESS);
870 }
871 
872 PetscErrorCode MatDestroy_SeqSELL(Mat A)
873 {
874   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
875 
876   PetscFunctionBegin;
877 #if defined(PETSC_USE_LOG)
878   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
879 #endif
880   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
881   PetscCall(ISDestroy(&a->row));
882   PetscCall(ISDestroy(&a->col));
883   PetscCall(PetscFree(a->diag));
884   PetscCall(PetscFree(a->rlen));
885   PetscCall(PetscFree(a->sliidx));
886   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
887   PetscCall(PetscFree(a->solve_work));
888   PetscCall(ISDestroy(&a->icol));
889   PetscCall(PetscFree(a->saved_values));
890   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
891   PetscCall(PetscFree(A->data));
892 #if defined(PETSC_HAVE_CUDA)
893   PetscCall(PetscFree(a->chunk_slice_map));
894 #endif
895 
896   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
897   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
898   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
899   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
900   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
901   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
902   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqAIJ_C", NULL));
903 #if defined(PETSC_HAVE_CUDA)
904   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqSELLCUDA_C", NULL));
905 #endif
906   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
907   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
908   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
909   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
910   PetscFunctionReturn(PETSC_SUCCESS);
911 }
912 
913 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
914 {
915   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
916 
917   PetscFunctionBegin;
918   switch (op) {
919   case MAT_ROW_ORIENTED:
920     a->roworiented = flg;
921     break;
922   case MAT_KEEP_NONZERO_PATTERN:
923     a->keepnonzeropattern = flg;
924     break;
925   case MAT_NEW_NONZERO_LOCATIONS:
926     a->nonew = (flg ? 0 : 1);
927     break;
928   case MAT_NEW_NONZERO_LOCATION_ERR:
929     a->nonew = (flg ? -1 : 0);
930     break;
931   case MAT_NEW_NONZERO_ALLOCATION_ERR:
932     a->nonew = (flg ? -2 : 0);
933     break;
934   case MAT_UNUSED_NONZERO_LOCATION_ERR:
935     a->nounused = (flg ? -1 : 0);
936     break;
937   case MAT_FORCE_DIAGONAL_ENTRIES:
938   case MAT_IGNORE_OFF_PROC_ENTRIES:
939   case MAT_USE_HASH_TABLE:
940   case MAT_SORTED_FULL:
941     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
942     break;
943   case MAT_SPD:
944   case MAT_SYMMETRIC:
945   case MAT_STRUCTURALLY_SYMMETRIC:
946   case MAT_HERMITIAN:
947   case MAT_SYMMETRY_ETERNAL:
948   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
949   case MAT_SPD_ETERNAL:
950     /* These options are handled directly by MatSetOption() */
951     break;
952   default:
953     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
954   }
955   PetscFunctionReturn(PETSC_SUCCESS);
956 }
957 
958 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
959 {
960   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
961   PetscInt     i, j, n, shift;
962   PetscScalar *x, zero = 0.0;
963 
964   PetscFunctionBegin;
965   PetscCall(VecGetLocalSize(v, &n));
966   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
967 
968   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
969     PetscInt *diag = a->diag;
970     PetscCall(VecGetArray(v, &x));
971     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
972     PetscCall(VecRestoreArray(v, &x));
973     PetscFunctionReturn(PETSC_SUCCESS);
974   }
975 
976   PetscCall(VecSet(v, zero));
977   PetscCall(VecGetArray(v, &x));
978   for (i = 0; i < n; i++) {                                     /* loop over rows */
979     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
980     x[i]  = 0;
981     for (j = 0; j < a->rlen[i]; j++) {
982       if (a->colidx[shift + a->sliceheight * j] == i) {
983         x[i] = a->val[shift + a->sliceheight * j];
984         break;
985       }
986     }
987   }
988   PetscCall(VecRestoreArray(v, &x));
989   PetscFunctionReturn(PETSC_SUCCESS);
990 }
991 
992 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
993 {
994   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
995   const PetscScalar *l, *r;
996   PetscInt           i, j, m, n, row;
997 
998   PetscFunctionBegin;
999   if (ll) {
1000     /* The local size is used so that VecMPI can be passed to this routine
1001        by MatDiagonalScale_MPISELL */
1002     PetscCall(VecGetLocalSize(ll, &m));
1003     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1004     PetscCall(VecGetArrayRead(ll, &l));
1005     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1006       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1007         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1008           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1009         }
1010       } else {
1011         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1012       }
1013     }
1014     PetscCall(VecRestoreArrayRead(ll, &l));
1015     PetscCall(PetscLogFlops(a->nz));
1016   }
1017   if (rr) {
1018     PetscCall(VecGetLocalSize(rr, &n));
1019     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1020     PetscCall(VecGetArrayRead(rr, &r));
1021     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1022       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1023         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1024           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1025         }
1026       } else {
1027         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1028       }
1029     }
1030     PetscCall(VecRestoreArrayRead(rr, &r));
1031     PetscCall(PetscLogFlops(a->nz));
1032   }
1033   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1034 #if defined(PETSC_HAVE_CUDA)
1035   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1036 #endif
1037   PetscFunctionReturn(PETSC_SUCCESS);
1038 }
1039 
1040 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1041 {
1042   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1043   PetscInt    *cp, i, k, low, high, t, row, col, l;
1044   PetscInt     shift;
1045   MatScalar   *vp;
1046 
1047   PetscFunctionBegin;
1048   for (k = 0; k < m; k++) { /* loop over requested rows */
1049     row = im[k];
1050     if (row < 0) continue;
1051     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1052     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1053     cp    = a->colidx + shift;                                        /* pointer to the row */
1054     vp    = a->val + shift;                                           /* pointer to the row */
1055     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1056       col = in[l];
1057       if (col < 0) continue;
1058       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1059       high = a->rlen[row];
1060       low  = 0; /* assume unsorted */
1061       while (high - low > 5) {
1062         t = (low + high) / 2;
1063         if (*(cp + a->sliceheight * t) > col) high = t;
1064         else low = t;
1065       }
1066       for (i = low; i < high; i++) {
1067         if (*(cp + a->sliceheight * i) > col) break;
1068         if (*(cp + a->sliceheight * i) == col) {
1069           *v++ = *(vp + a->sliceheight * i);
1070           goto finished;
1071         }
1072       }
1073       *v++ = 0.0;
1074     finished:;
1075     }
1076   }
1077   PetscFunctionReturn(PETSC_SUCCESS);
1078 }
1079 
1080 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1081 {
1082   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1083   PetscInt          i, j, m = A->rmap->n, shift;
1084   const char       *name;
1085   PetscViewerFormat format;
1086 
1087   PetscFunctionBegin;
1088   PetscCall(PetscViewerGetFormat(viewer, &format));
1089   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1090     PetscInt nofinalvalue = 0;
1091     /*
1092     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1093       nofinalvalue = 1;
1094     }
1095     */
1096     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1097     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1098     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1099 #if defined(PETSC_USE_COMPLEX)
1100     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1101 #else
1102     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1103 #endif
1104     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1105 
1106     for (i = 0; i < m; i++) {
1107       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1108       for (j = 0; j < a->rlen[i]; j++) {
1109 #if defined(PETSC_USE_COMPLEX)
1110         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1111 #else
1112         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1113 #endif
1114       }
1115     }
1116     /*
1117     if (nofinalvalue) {
1118 #if defined(PETSC_USE_COMPLEX)
1119       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1120 #else
1121       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1122 #endif
1123     }
1124     */
1125     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1126     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1127     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1128   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1129     PetscFunctionReturn(PETSC_SUCCESS);
1130   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1131     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1132     for (i = 0; i < m; i++) {
1133       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1134       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1135       for (j = 0; j < a->rlen[i]; j++) {
1136 #if defined(PETSC_USE_COMPLEX)
1137         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1138           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1139         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1140           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1141         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1142           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1143         }
1144 #else
1145         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1146 #endif
1147       }
1148       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1149     }
1150     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1151   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1152     PetscInt    cnt = 0, jcnt;
1153     PetscScalar value;
1154 #if defined(PETSC_USE_COMPLEX)
1155     PetscBool realonly = PETSC_TRUE;
1156     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1157       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1158         realonly = PETSC_FALSE;
1159         break;
1160       }
1161     }
1162 #endif
1163 
1164     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1165     for (i = 0; i < m; i++) {
1166       jcnt  = 0;
1167       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1168       for (j = 0; j < A->cmap->n; j++) {
1169         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1170           value = a->val[cnt++];
1171           jcnt++;
1172         } else {
1173           value = 0.0;
1174         }
1175 #if defined(PETSC_USE_COMPLEX)
1176         if (realonly) {
1177           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1178         } else {
1179           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1180         }
1181 #else
1182         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1183 #endif
1184       }
1185       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1186     }
1187     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1188   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1189     PetscInt fshift = 1;
1190     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1191 #if defined(PETSC_USE_COMPLEX)
1192     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1193 #else
1194     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1195 #endif
1196     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1197     for (i = 0; i < m; i++) {
1198       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1199       for (j = 0; j < a->rlen[i]; j++) {
1200 #if defined(PETSC_USE_COMPLEX)
1201         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1202 #else
1203         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1204 #endif
1205       }
1206     }
1207     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1208   } else if (format == PETSC_VIEWER_NATIVE) {
1209     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1210       PetscInt row;
1211       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1212       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1213 #if defined(PETSC_USE_COMPLEX)
1214         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1215           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1216         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1217           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1218         } else {
1219           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1220         }
1221 #else
1222         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1223 #endif
1224       }
1225     }
1226   } else {
1227     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1228     if (A->factortype) {
1229       for (i = 0; i < m; i++) {
1230         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1231         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1232         /* L part */
1233         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1234 #if defined(PETSC_USE_COMPLEX)
1235           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1236             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1237           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1238             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1239           } else {
1240             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1241           }
1242 #else
1243           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1244 #endif
1245         }
1246         /* diagonal */
1247         j = a->diag[i];
1248 #if defined(PETSC_USE_COMPLEX)
1249         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1250           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1251         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1252           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1253         } else {
1254           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1255         }
1256 #else
1257         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1258 #endif
1259 
1260         /* U part */
1261         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1262 #if defined(PETSC_USE_COMPLEX)
1263           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1264             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1265           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1266             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1267           } else {
1268             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1269           }
1270 #else
1271           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1272 #endif
1273         }
1274         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1275       }
1276     } else {
1277       for (i = 0; i < m; i++) {
1278         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1279         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1280         for (j = 0; j < a->rlen[i]; j++) {
1281 #if defined(PETSC_USE_COMPLEX)
1282           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1283             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1284           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1285             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1286           } else {
1287             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1288           }
1289 #else
1290           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1291 #endif
1292         }
1293         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1294       }
1295     }
1296     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1297   }
1298   PetscCall(PetscViewerFlush(viewer));
1299   PetscFunctionReturn(PETSC_SUCCESS);
1300 }
1301 
1302 #include <petscdraw.h>
1303 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1304 {
1305   Mat               A = (Mat)Aa;
1306   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1307   PetscInt          i, j, m = A->rmap->n, shift;
1308   int               color;
1309   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1310   PetscViewer       viewer;
1311   PetscViewerFormat format;
1312 
1313   PetscFunctionBegin;
1314   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1315   PetscCall(PetscViewerGetFormat(viewer, &format));
1316   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1317 
1318   /* loop over matrix elements drawing boxes */
1319 
1320   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1321     PetscDrawCollectiveBegin(draw);
1322     /* Blue for negative, Cyan for zero and  Red for positive */
1323     color = PETSC_DRAW_BLUE;
1324     for (i = 0; i < m; i++) {
1325       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1326       y_l   = m - i - 1.0;
1327       y_r   = y_l + 1.0;
1328       for (j = 0; j < a->rlen[i]; j++) {
1329         x_l = a->colidx[shift + a->sliceheight * j];
1330         x_r = x_l + 1.0;
1331         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1332         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1333       }
1334     }
1335     color = PETSC_DRAW_CYAN;
1336     for (i = 0; i < m; i++) {
1337       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1338       y_l   = m - i - 1.0;
1339       y_r   = y_l + 1.0;
1340       for (j = 0; j < a->rlen[i]; j++) {
1341         x_l = a->colidx[shift + a->sliceheight * j];
1342         x_r = x_l + 1.0;
1343         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1344         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1345       }
1346     }
1347     color = PETSC_DRAW_RED;
1348     for (i = 0; i < m; i++) {
1349       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1350       y_l   = m - i - 1.0;
1351       y_r   = y_l + 1.0;
1352       for (j = 0; j < a->rlen[i]; j++) {
1353         x_l = a->colidx[shift + a->sliceheight * j];
1354         x_r = x_l + 1.0;
1355         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1356         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1357       }
1358     }
1359     PetscDrawCollectiveEnd(draw);
1360   } else {
1361     /* use contour shading to indicate magnitude of values */
1362     /* first determine max of all nonzero values */
1363     PetscReal minv = 0.0, maxv = 0.0;
1364     PetscInt  count = 0;
1365     PetscDraw popup;
1366     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1367       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1368     }
1369     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1370     PetscCall(PetscDrawGetPopup(draw, &popup));
1371     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1372 
1373     PetscDrawCollectiveBegin(draw);
1374     for (i = 0; i < m; i++) {
1375       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1376       y_l   = m - i - 1.0;
1377       y_r   = y_l + 1.0;
1378       for (j = 0; j < a->rlen[i]; j++) {
1379         x_l   = a->colidx[shift + a->sliceheight * j];
1380         x_r   = x_l + 1.0;
1381         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1382         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1383         count++;
1384       }
1385     }
1386     PetscDrawCollectiveEnd(draw);
1387   }
1388   PetscFunctionReturn(PETSC_SUCCESS);
1389 }
1390 
1391 #include <petscdraw.h>
1392 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1393 {
1394   PetscDraw draw;
1395   PetscReal xr, yr, xl, yl, h, w;
1396   PetscBool isnull;
1397 
1398   PetscFunctionBegin;
1399   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1400   PetscCall(PetscDrawIsNull(draw, &isnull));
1401   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1402 
1403   xr = A->cmap->n;
1404   yr = A->rmap->n;
1405   h  = yr / 10.0;
1406   w  = xr / 10.0;
1407   xr += w;
1408   yr += h;
1409   xl = -w;
1410   yl = -h;
1411   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1412   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1413   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1414   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1415   PetscCall(PetscDrawSave(draw));
1416   PetscFunctionReturn(PETSC_SUCCESS);
1417 }
1418 
1419 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1420 {
1421   PetscBool iascii, isbinary, isdraw;
1422 
1423   PetscFunctionBegin;
1424   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1425   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1426   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1427   if (iascii) {
1428     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1429   } else if (isbinary) {
1430     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1431   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1432   PetscFunctionReturn(PETSC_SUCCESS);
1433 }
1434 
1435 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1436 {
1437   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1438   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1439   MatScalar   *vp;
1440 #if defined(PETSC_HAVE_CUDA)
1441   PetscInt totalchunks = 0;
1442 #endif
1443 
1444   PetscFunctionBegin;
1445   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1446   /* To do: compress out the unused elements */
1447   PetscCall(MatMarkDiagonal_SeqSELL(A));
1448   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1449   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1450   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1451   a->nonzerorowcnt = 0;
1452   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1453   for (i = 0; i < a->totalslices; ++i) {
1454     shift = a->sliidx[i];                                                   /* starting index of the slice */
1455     cp    = a->colidx + shift;                                              /* pointer to the column indices of the slice */
1456     vp    = a->val + shift;                                                 /* pointer to the nonzero values of the slice */
1457     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1458       row  = a->sliceheight * i + row_in_slice;
1459       nrow = a->rlen[row]; /* number of nonzeros in row */
1460       /*
1461         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1462         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1463       */
1464       lastcol = 0;
1465       if (nrow > 0) { /* nonempty row */
1466         a->nonzerorowcnt++;
1467         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1468       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1469         for (j = 1; j < a->sliceheight; j++) {
1470           if (a->rlen[a->sliceheight * i + j]) {
1471             lastcol = cp[j];
1472             break;
1473           }
1474         }
1475       } else {
1476         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1477       }
1478 
1479       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1480         cp[a->sliceheight * k + row_in_slice] = lastcol;
1481         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1482       }
1483     }
1484   }
1485 
1486   A->info.mallocs += a->reallocs;
1487   a->reallocs = 0;
1488 
1489   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1490 #if defined(PETSC_HAVE_CUDA)
1491   if (!a->chunksize && a->totalslices) {
1492     a->chunksize = 64;
1493     while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1494     totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1495   }
1496   if (totalchunks != a->totalchunks) {
1497     PetscCall(PetscFree(a->chunk_slice_map));
1498     PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1499     a->totalchunks = totalchunks;
1500   }
1501   j = 0;
1502   for (i = 0; i < totalchunks; i++) {
1503     while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1504     a->chunk_slice_map[i] = j;
1505   }
1506 #endif
1507   PetscFunctionReturn(PETSC_SUCCESS);
1508 }
1509 
1510 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1511 {
1512   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1513 
1514   PetscFunctionBegin;
1515   info->block_size   = 1.0;
1516   info->nz_allocated = a->maxallocmat;
1517   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1518   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1519   info->assemblies   = A->num_ass;
1520   info->mallocs      = A->info.mallocs;
1521   info->memory       = 0; /* REVIEW ME */
1522   if (A->factortype) {
1523     info->fill_ratio_given  = A->info.fill_ratio_given;
1524     info->fill_ratio_needed = A->info.fill_ratio_needed;
1525     info->factor_mallocs    = A->info.factor_mallocs;
1526   } else {
1527     info->fill_ratio_given  = 0;
1528     info->fill_ratio_needed = 0;
1529     info->factor_mallocs    = 0;
1530   }
1531   PetscFunctionReturn(PETSC_SUCCESS);
1532 }
1533 
1534 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1535 {
1536   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1537   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1538   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1539   MatScalar   *vp, value;
1540 #if defined(PETSC_HAVE_CUDA)
1541   PetscBool inserted = PETSC_FALSE;
1542   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1543 #endif
1544 
1545   PetscFunctionBegin;
1546   for (k = 0; k < m; k++) { /* loop over added rows */
1547     row = im[k];
1548     if (row < 0) continue;
1549     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1550     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1551     cp    = a->colidx + shift;                                      /* pointer to the row */
1552     vp    = a->val + shift;                                         /* pointer to the row */
1553     nrow  = a->rlen[row];
1554     low   = 0;
1555     high  = nrow;
1556 
1557     for (l = 0; l < n; l++) { /* loop over added columns */
1558       col = in[l];
1559       if (col < 0) continue;
1560       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1561       if (a->roworiented) {
1562         value = v[l + k * n];
1563       } else {
1564         value = v[k + l * m];
1565       }
1566       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1567 
1568       /* search in this row for the specified column, i indicates the column to be set */
1569       if (col <= lastcol) low = 0;
1570       else high = nrow;
1571       lastcol = col;
1572       while (high - low > 5) {
1573         t = (low + high) / 2;
1574         if (*(cp + a->sliceheight * t) > col) high = t;
1575         else low = t;
1576       }
1577       for (i = low; i < high; i++) {
1578         if (*(cp + a->sliceheight * i) > col) break;
1579         if (*(cp + a->sliceheight * i) == col) {
1580           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1581           else *(vp + a->sliceheight * i) = value;
1582 #if defined(PETSC_HAVE_CUDA)
1583           inserted = PETSC_TRUE;
1584 #endif
1585           low = i + 1;
1586           goto noinsert;
1587         }
1588       }
1589       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1590       if (nonew == 1) goto noinsert;
1591       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1592 #if defined(PETSC_HAVE_CUDA)
1593       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1594 #else
1595       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1596       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1597 #endif
1598       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1599       for (ii = nrow - 1; ii >= i; ii--) {
1600         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1601         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1602       }
1603       a->rlen[row]++;
1604       *(cp + a->sliceheight * i) = col;
1605       *(vp + a->sliceheight * i) = value;
1606       a->nz++;
1607       A->nonzerostate++;
1608 #if defined(PETSC_HAVE_CUDA)
1609       inserted = PETSC_TRUE;
1610 #endif
1611       low = i + 1;
1612       high++;
1613       nrow++;
1614     noinsert:;
1615     }
1616     a->rlen[row] = nrow;
1617   }
1618 #if defined(PETSC_HAVE_CUDA)
1619   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1620 #endif
1621   PetscFunctionReturn(PETSC_SUCCESS);
1622 }
1623 
1624 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1625 {
1626   PetscFunctionBegin;
1627   /* If the two matrices have the same copy implementation, use fast copy. */
1628   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1629     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1630     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1631 
1632     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1633     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1634   } else {
1635     PetscCall(MatCopy_Basic(A, B, str));
1636   }
1637   PetscFunctionReturn(PETSC_SUCCESS);
1638 }
1639 
1640 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1641 {
1642   PetscFunctionBegin;
1643   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1644   PetscFunctionReturn(PETSC_SUCCESS);
1645 }
1646 
1647 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1648 {
1649   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1650 
1651   PetscFunctionBegin;
1652   *array = a->val;
1653   PetscFunctionReturn(PETSC_SUCCESS);
1654 }
1655 
1656 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1657 {
1658   PetscFunctionBegin;
1659   PetscFunctionReturn(PETSC_SUCCESS);
1660 }
1661 
1662 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1663 {
1664   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1665   PetscInt     i;
1666   MatScalar   *aval = a->val;
1667 
1668   PetscFunctionBegin;
1669   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1670 #if defined(PETSC_HAVE_CUDA)
1671   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1672 #endif
1673   PetscFunctionReturn(PETSC_SUCCESS);
1674 }
1675 
1676 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1677 {
1678   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1679   PetscInt     i;
1680   MatScalar   *aval = a->val;
1681 
1682   PetscFunctionBegin;
1683   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1684   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1685 #if defined(PETSC_HAVE_CUDA)
1686   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1687 #endif
1688   PetscFunctionReturn(PETSC_SUCCESS);
1689 }
1690 
1691 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1692 {
1693   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1694   MatScalar   *aval   = a->val;
1695   PetscScalar  oalpha = alpha;
1696   PetscBLASInt one    = 1, size;
1697 
1698   PetscFunctionBegin;
1699   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1700   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1701   PetscCall(PetscLogFlops(a->nz));
1702   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1703 #if defined(PETSC_HAVE_CUDA)
1704   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1705 #endif
1706   PetscFunctionReturn(PETSC_SUCCESS);
1707 }
1708 
1709 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1710 {
1711   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1712 
1713   PetscFunctionBegin;
1714   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1715   PetscCall(MatShift_Basic(Y, a));
1716   PetscFunctionReturn(PETSC_SUCCESS);
1717 }
1718 
1719 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1720 {
1721   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1722   PetscScalar       *x, sum, *t;
1723   const MatScalar   *idiag = NULL, *mdiag;
1724   const PetscScalar *b, *xb;
1725   PetscInt           n, m = A->rmap->n, i, j, shift;
1726   const PetscInt    *diag;
1727 
1728   PetscFunctionBegin;
1729   its = its * lits;
1730 
1731   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1732   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1733   a->fshift = fshift;
1734   a->omega  = omega;
1735 
1736   diag  = a->diag;
1737   t     = a->ssor_work;
1738   idiag = a->idiag;
1739   mdiag = a->mdiag;
1740 
1741   PetscCall(VecGetArray(xx, &x));
1742   PetscCall(VecGetArrayRead(bb, &b));
1743   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1744   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1745   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1746   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1747 
1748   if (flag & SOR_ZERO_INITIAL_GUESS) {
1749     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1750       for (i = 0; i < m; i++) {
1751         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1752         sum   = b[i];
1753         n     = (diag[i] - shift) / a->sliceheight;
1754         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1755         t[i] = sum;
1756         x[i] = sum * idiag[i];
1757       }
1758       xb = t;
1759       PetscCall(PetscLogFlops(a->nz));
1760     } else xb = b;
1761     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1762       for (i = m - 1; i >= 0; i--) {
1763         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1764         sum   = xb[i];
1765         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1766         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1767         if (xb == b) {
1768           x[i] = sum * idiag[i];
1769         } else {
1770           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1771         }
1772       }
1773       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1774     }
1775     its--;
1776   }
1777   while (its--) {
1778     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1779       for (i = 0; i < m; i++) {
1780         /* lower */
1781         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1782         sum   = b[i];
1783         n     = (diag[i] - shift) / a->sliceheight;
1784         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1785         t[i] = sum; /* save application of the lower-triangular part */
1786         /* upper */
1787         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1788         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1789         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1790       }
1791       xb = t;
1792       PetscCall(PetscLogFlops(2.0 * a->nz));
1793     } else xb = b;
1794     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1795       for (i = m - 1; i >= 0; i--) {
1796         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1797         sum   = xb[i];
1798         if (xb == b) {
1799           /* whole matrix (no checkpointing available) */
1800           n = a->rlen[i];
1801           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1802           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1803         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1804           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1805           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1806           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1807         }
1808       }
1809       if (xb == b) {
1810         PetscCall(PetscLogFlops(2.0 * a->nz));
1811       } else {
1812         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1813       }
1814     }
1815   }
1816   PetscCall(VecRestoreArray(xx, &x));
1817   PetscCall(VecRestoreArrayRead(bb, &b));
1818   PetscFunctionReturn(PETSC_SUCCESS);
1819 }
1820 
1821 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1822                                        MatGetRow_SeqSELL,
1823                                        MatRestoreRow_SeqSELL,
1824                                        MatMult_SeqSELL,
1825                                        /* 4*/ MatMultAdd_SeqSELL,
1826                                        MatMultTranspose_SeqSELL,
1827                                        MatMultTransposeAdd_SeqSELL,
1828                                        NULL,
1829                                        NULL,
1830                                        NULL,
1831                                        /* 10*/ NULL,
1832                                        NULL,
1833                                        NULL,
1834                                        MatSOR_SeqSELL,
1835                                        NULL,
1836                                        /* 15*/ MatGetInfo_SeqSELL,
1837                                        MatEqual_SeqSELL,
1838                                        MatGetDiagonal_SeqSELL,
1839                                        MatDiagonalScale_SeqSELL,
1840                                        NULL,
1841                                        /* 20*/ NULL,
1842                                        MatAssemblyEnd_SeqSELL,
1843                                        MatSetOption_SeqSELL,
1844                                        MatZeroEntries_SeqSELL,
1845                                        /* 24*/ NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        NULL,
1850                                        /* 29*/ MatSetUp_SeqSELL,
1851                                        NULL,
1852                                        NULL,
1853                                        NULL,
1854                                        NULL,
1855                                        /* 34*/ MatDuplicate_SeqSELL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        NULL,
1860                                        /* 39*/ NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        MatGetValues_SeqSELL,
1864                                        MatCopy_SeqSELL,
1865                                        /* 44*/ NULL,
1866                                        MatScale_SeqSELL,
1867                                        MatShift_SeqSELL,
1868                                        NULL,
1869                                        NULL,
1870                                        /* 49*/ NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        NULL,
1875                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1876                                        NULL,
1877                                        NULL,
1878                                        NULL,
1879                                        NULL,
1880                                        /* 59*/ NULL,
1881                                        MatDestroy_SeqSELL,
1882                                        MatView_SeqSELL,
1883                                        NULL,
1884                                        NULL,
1885                                        /* 64*/ NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        NULL,
1890                                        /* 69*/ NULL,
1891                                        NULL,
1892                                        NULL,
1893                                        NULL,
1894                                        NULL,
1895                                        /* 74*/ NULL,
1896                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1897                                        NULL,
1898                                        NULL,
1899                                        NULL,
1900                                        /* 79*/ NULL,
1901                                        NULL,
1902                                        NULL,
1903                                        NULL,
1904                                        NULL,
1905                                        /* 84*/ NULL,
1906                                        NULL,
1907                                        NULL,
1908                                        NULL,
1909                                        NULL,
1910                                        /* 89*/ NULL,
1911                                        NULL,
1912                                        NULL,
1913                                        NULL,
1914                                        NULL,
1915                                        /* 94*/ NULL,
1916                                        NULL,
1917                                        NULL,
1918                                        NULL,
1919                                        NULL,
1920                                        /* 99*/ NULL,
1921                                        NULL,
1922                                        NULL,
1923                                        MatConjugate_SeqSELL,
1924                                        NULL,
1925                                        /*104*/ NULL,
1926                                        NULL,
1927                                        NULL,
1928                                        NULL,
1929                                        NULL,
1930                                        /*109*/ NULL,
1931                                        NULL,
1932                                        NULL,
1933                                        NULL,
1934                                        MatMissingDiagonal_SeqSELL,
1935                                        /*114*/ NULL,
1936                                        NULL,
1937                                        NULL,
1938                                        NULL,
1939                                        NULL,
1940                                        /*119*/ NULL,
1941                                        NULL,
1942                                        NULL,
1943                                        NULL,
1944                                        NULL,
1945                                        /*124*/ NULL,
1946                                        NULL,
1947                                        NULL,
1948                                        NULL,
1949                                        NULL,
1950                                        /*129*/ NULL,
1951                                        NULL,
1952                                        NULL,
1953                                        NULL,
1954                                        NULL,
1955                                        /*134*/ NULL,
1956                                        NULL,
1957                                        NULL,
1958                                        NULL,
1959                                        NULL,
1960                                        /*139*/ NULL,
1961                                        NULL,
1962                                        NULL,
1963                                        MatFDColoringSetUp_SeqXAIJ,
1964                                        NULL,
1965                                        /*144*/ NULL,
1966                                        NULL,
1967                                        NULL,
1968                                        NULL,
1969                                        NULL,
1970                                        NULL,
1971                                        /*150*/ NULL,
1972                                        NULL};
1973 
1974 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1975 {
1976   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1977 
1978   PetscFunctionBegin;
1979   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1980 
1981   /* allocate space for values if not already there */
1982   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1983 
1984   /* copy values over */
1985   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1986   PetscFunctionReturn(PETSC_SUCCESS);
1987 }
1988 
1989 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1990 {
1991   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1992 
1993   PetscFunctionBegin;
1994   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1995   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1996   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1997   PetscFunctionReturn(PETSC_SUCCESS);
1998 }
1999 
2000 PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
2001 {
2002   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2003 
2004   PetscFunctionBegin;
2005   if (a->totalslices && a->sliidx[a->totalslices]) {
2006     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
2007   } else {
2008     *ratio = 0.0;
2009   }
2010   PetscFunctionReturn(PETSC_SUCCESS);
2011 }
2012 
2013 PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
2014 {
2015   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2016   PetscInt     i, current_slicewidth;
2017 
2018   PetscFunctionBegin;
2019   *slicewidth = 0;
2020   for (i = 0; i < a->totalslices; i++) {
2021     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
2022     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
2023   }
2024   PetscFunctionReturn(PETSC_SUCCESS);
2025 }
2026 
2027 PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
2028 {
2029   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2030 
2031   PetscFunctionBegin;
2032   *slicewidth = 0;
2033   if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
2034   PetscFunctionReturn(PETSC_SUCCESS);
2035 }
2036 
2037 PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2038 {
2039   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2040 
2041   PetscFunctionBegin;
2042   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2043   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2044   a->sliceheight = sliceheight;
2045 #if defined(PETSC_HAVE_CUDA)
2046   PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight);
2047 #endif
2048   PetscFunctionReturn(PETSC_SUCCESS);
2049 }
2050 
2051 /*@C
2052   MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
2053 
2054   Not Collective
2055 
2056   Input Parameters:
2057 +  A - a `MATSEQSELL` matrix
2058 -  array - pointer to the data
2059 
2060   Level: intermediate
2061 
2062 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
2063 @*/
2064 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
2065 {
2066   PetscFunctionBegin;
2067   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
2068   PetscFunctionReturn(PETSC_SUCCESS);
2069 }
2070 
2071 /*@C
2072   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2073 
2074   Not Collective
2075 
2076   Input Parameter:
2077 .  A - a MATSEQSELL matrix
2078 
2079   Output Parameter:
2080 .  ratio - ratio of number of padded zeros to number of allocated elements
2081 
2082   Level: intermediate
2083 @*/
2084 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2085 {
2086   PetscFunctionBegin;
2087   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2088   PetscFunctionReturn(PETSC_SUCCESS);
2089 }
2090 
2091 /*@C
2092   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2093 
2094   Not Collective
2095 
2096   Input Parameter:
2097 .  A - a MATSEQSELL matrix
2098 
2099   Output Parameter:
2100 .  slicewidth - maximum slice width
2101 
2102  Level: intermediate
2103 @*/
2104 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2105 {
2106   PetscFunctionBegin;
2107   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2108   PetscFunctionReturn(PETSC_SUCCESS);
2109 }
2110 
2111 /*@C
2112   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2113 
2114   Not Collective
2115 
2116   Input Parameter:
2117 .  A - a MATSEQSELL matrix
2118 
2119   Output Parameter:
2120 .  slicewidth - average slice width
2121 
2122   Level: intermediate
2123 @*/
2124 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2125 {
2126   PetscFunctionBegin;
2127   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2128   PetscFunctionReturn(PETSC_SUCCESS);
2129 }
2130 
2131 /*@C
2132   MatSeqSELLSetSliceHeight - sets the slice height.
2133 
2134   Not Collective
2135 
2136   Input Parameters:
2137 +  A - a MATSEQSELL matrix
2138 -  sliceheight - slice height
2139 
2140   Notes:
2141   You cannot change the slice height once it have been set.
2142 
2143   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2144 
2145   Level: intermediate
2146 @*/
2147 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2148 {
2149   PetscFunctionBegin;
2150   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2151   PetscFunctionReturn(PETSC_SUCCESS);
2152 }
2153 
2154 /*@C
2155   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2156 
2157   Not Collective
2158 
2159   Input Parameter:
2160 .  A - a MATSEQSELL matrix
2161 
2162   Output Parameter:
2163 .  variance - variance of the slice size
2164 
2165   Level: intermediate
2166 @*/
2167 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2168 {
2169   PetscFunctionBegin;
2170   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2171   PetscFunctionReturn(PETSC_SUCCESS);
2172 }
2173 
2174 #if defined(PETSC_HAVE_CUDA)
2175 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2176 #endif
2177 
2178 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2179 {
2180   Mat_SeqSELL *b;
2181   PetscMPIInt  size;
2182 
2183   PetscFunctionBegin;
2184   PetscCall(PetscCitationsRegister(citation, &cited));
2185   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2186   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2187 
2188   PetscCall(PetscNew(&b));
2189 
2190   B->data = (void *)b;
2191 
2192   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
2193 
2194   b->row                = NULL;
2195   b->col                = NULL;
2196   b->icol               = NULL;
2197   b->reallocs           = 0;
2198   b->ignorezeroentries  = PETSC_FALSE;
2199   b->roworiented        = PETSC_TRUE;
2200   b->nonew              = 0;
2201   b->diag               = NULL;
2202   b->solve_work         = NULL;
2203   B->spptr              = NULL;
2204   b->saved_values       = NULL;
2205   b->idiag              = NULL;
2206   b->mdiag              = NULL;
2207   b->ssor_work          = NULL;
2208   b->omega              = 1.0;
2209   b->fshift             = 0.0;
2210   b->idiagvalid         = PETSC_FALSE;
2211   b->keepnonzeropattern = PETSC_FALSE;
2212   b->sliceheight        = 0;
2213 
2214   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2215   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2216   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2217   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2218   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2219   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2220   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqAIJ_C", MatConvert_SeqSELL_SeqAIJ));
2221 #if defined(PETSC_HAVE_CUDA)
2222   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqSELLCUDA_C", MatConvert_SeqSELL_SeqSELLCUDA));
2223 #endif
2224   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2225   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2226   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2227   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2228 
2229   PetscObjectOptionsBegin((PetscObject)B);
2230   {
2231     PetscInt  newsh = -1;
2232     PetscBool flg;
2233 #if defined(PETSC_HAVE_CUDA)
2234     PetscInt chunksize = 0;
2235 #endif
2236 
2237     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2238     if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2239 #if defined(PETSC_HAVE_CUDA)
2240     PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2241     if (flg) {
2242       PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2243       b->chunksize = chunksize;
2244     }
2245 #endif
2246   }
2247   PetscOptionsEnd();
2248   PetscFunctionReturn(PETSC_SUCCESS);
2249 }
2250 
2251 /*
2252  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2253  */
2254 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2255 {
2256   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2257   PetscInt     i, m                           = A->rmap->n;
2258   PetscInt     totalslices = a->totalslices;
2259 
2260   PetscFunctionBegin;
2261   C->factortype = A->factortype;
2262   c->row        = NULL;
2263   c->col        = NULL;
2264   c->icol       = NULL;
2265   c->reallocs   = 0;
2266   C->assembled  = PETSC_TRUE;
2267 
2268   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2269   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2270 
2271   PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen));
2272   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2273 
2274   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2275   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2276 
2277   /* allocate the matrix space */
2278   if (mallocmatspace) {
2279     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2280 
2281     c->singlemalloc = PETSC_TRUE;
2282 
2283     if (m > 0) {
2284       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2285       if (cpvalues == MAT_COPY_VALUES) {
2286         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2287       } else {
2288         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2289       }
2290     }
2291   }
2292 
2293   c->ignorezeroentries = a->ignorezeroentries;
2294   c->roworiented       = a->roworiented;
2295   c->nonew             = a->nonew;
2296   if (a->diag) {
2297     PetscCall(PetscMalloc1(m, &c->diag));
2298     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2299   } else c->diag = NULL;
2300 
2301   c->solve_work         = NULL;
2302   c->saved_values       = NULL;
2303   c->idiag              = NULL;
2304   c->ssor_work          = NULL;
2305   c->keepnonzeropattern = a->keepnonzeropattern;
2306   c->free_val           = PETSC_TRUE;
2307   c->free_colidx        = PETSC_TRUE;
2308 
2309   c->maxallocmat  = a->maxallocmat;
2310   c->maxallocrow  = a->maxallocrow;
2311   c->rlenmax      = a->rlenmax;
2312   c->nz           = a->nz;
2313   C->preallocated = PETSC_TRUE;
2314 
2315   c->nonzerorowcnt = a->nonzerorowcnt;
2316   C->nonzerostate  = A->nonzerostate;
2317 
2318   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2319   PetscFunctionReturn(PETSC_SUCCESS);
2320 }
2321 
2322 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2323 {
2324   PetscFunctionBegin;
2325   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2326   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2327   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2328   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2329   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2330   PetscFunctionReturn(PETSC_SUCCESS);
2331 }
2332 
2333 /*MC
2334    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2335    based on the sliced Ellpack format
2336 
2337    Options Database Key:
2338 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2339 
2340    Level: beginner
2341 
2342 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2343 M*/
2344 
2345 /*MC
2346    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2347 
2348    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2349    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2350   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2351   for communicators controlling multiple processes.  It is recommended that you call both of
2352   the above preallocation routines for simplicity.
2353 
2354    Options Database Key:
2355 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2356 
2357   Level: beginner
2358 
2359   Notes:
2360    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2361 
2362    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2363    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2364 
2365   Developer Notes:
2366    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2367 
2368    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2369 .vb
2370                             (2 0  3 4)
2371    Consider the matrix A =  (5 0  6 0)
2372                             (0 0  7 8)
2373                             (0 0  9 9)
2374 
2375    symbolically the Ellpack format can be written as
2376 
2377         (2 3 4 |)           (0 2 3 |)
2378    v =  (5 6 0 |)  colidx = (0 2 2 |)
2379         --------            ---------
2380         (7 8 |)             (2 3 |)
2381         (9 9 |)             (2 3 |)
2382 
2383     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2384     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2385     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2386 
2387     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2388 
2389 .ve
2390 
2391       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2392 
2393  References:
2394 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2395    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2396 
2397 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2398 M*/
2399 
2400 /*@C
2401        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2402 
2403  Collective
2404 
2405  Input Parameters:
2406 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2407 .  m - number of rows
2408 .  n - number of columns
2409 .  rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2410 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2411 
2412  Output Parameter:
2413 .  A - the matrix
2414 
2415  Level: intermediate
2416 
2417  Notes:
2418  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2419  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2420  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2421 
2422  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2423  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2424  allocation.
2425 
2426  .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2427  @*/
2428 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2429 {
2430   PetscFunctionBegin;
2431   PetscCall(MatCreate(comm, A));
2432   PetscCall(MatSetSizes(*A, m, n, m, n));
2433   PetscCall(MatSetType(*A, MATSEQSELL));
2434   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2435   PetscFunctionReturn(PETSC_SUCCESS);
2436 }
2437 
2438 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2439 {
2440   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2441   PetscInt     totalslices = a->totalslices;
2442 
2443   PetscFunctionBegin;
2444   /* If the  matrix dimensions are not equal,or no of nonzeros */
2445   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2446     *flg = PETSC_FALSE;
2447     PetscFunctionReturn(PETSC_SUCCESS);
2448   }
2449   /* if the a->colidx are the same */
2450   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2451   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2452   /* if a->val are the same */
2453   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2454   PetscFunctionReturn(PETSC_SUCCESS);
2455 }
2456 
2457 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2458 {
2459   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2460 
2461   PetscFunctionBegin;
2462   a->idiagvalid = PETSC_FALSE;
2463   PetscFunctionReturn(PETSC_SUCCESS);
2464 }
2465 
2466 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2467 {
2468 #if defined(PETSC_USE_COMPLEX)
2469   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2470   PetscInt     i;
2471   PetscScalar *val = a->val;
2472 
2473   PetscFunctionBegin;
2474   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2475   #if defined(PETSC_HAVE_CUDA)
2476   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2477   #endif
2478 #else
2479   PetscFunctionBegin;
2480 #endif
2481   PetscFunctionReturn(PETSC_SUCCESS);
2482 }
2483