xref: /petsc/src/mat/impls/sell/seq/sell.c (revision f4f49eeac7efa77fffa46b7ff95a3ed169f659ed)
1 /*
2   Defines the basic matrix operations for the SELL matrix storage format.
3 */
4 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
5 #include <petscblaslapack.h>
6 #include <petsc/private/kernels/blocktranspose.h>
7 
8 static PetscBool  cited      = PETSC_FALSE;
9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
10                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
11                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
12                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
13                                " year = 2018\n"
14                                "}\n";
15 
16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
17 
18   #include <immintrin.h>
19 
20   #if !defined(_MM_SCALE_8)
21     #define _MM_SCALE_8 8
22   #endif
23 
24   #if defined(__AVX512F__)
25     /* these do not work
26    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
27    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
28   */
29     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
30       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
31       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
32       vec_vals = _mm512_loadu_pd(aval); \
33       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
34       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
35   #elif defined(__AVX2__) && defined(__FMA__)
36     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
37       vec_vals = _mm256_loadu_pd(aval); \
38       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
39       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
40       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
41   #endif
42 #endif /* PETSC_HAVE_IMMINTRIN_H */
43 
44 /*@C
45   MatSeqSELLSetPreallocation - For good matrix assembly performance
46   the user should preallocate the matrix storage by setting the parameter `nz`
47   (or the array `nnz`).
48 
49   Collective
50 
51   Input Parameters:
52 + B       - The `MATSEQSELL` matrix
53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
54 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
55 
56   Level: intermediate
57 
58   Notes:
59   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
60   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
61   allocation.
62 
63   You can call `MatGetInfo()` to get information on how effective the preallocation was;
64   for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
65   You can also run with the option `-info` and look for messages with the string
66   malloc in them to see if additional memory allocation was needed.
67 
68   Developer Notes:
69   Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
70   entries or columns indices.
71 
72   The maximum number of nonzeos in any row should be as accurate as possible.
73   If it is underestimated, you will get bad performance due to reallocation
74   (`MatSeqXSELLReallocateSELL()`).
75 
76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
77  @*/
78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
79 {
80   PetscFunctionBegin;
81   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
82   PetscValidType(B, 1);
83   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
84   PetscFunctionReturn(PETSC_SUCCESS);
85 }
86 
87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
88 {
89   Mat_SeqSELL *b;
90   PetscInt     i, j, totalslices;
91 #if defined(PETSC_HAVE_CUDA)
92   PetscInt rlenmax = 0;
93 #endif
94   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
95 
96   PetscFunctionBegin;
97   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
98   if (maxallocrow == MAT_SKIP_ALLOCATION) {
99     skipallocation = PETSC_TRUE;
100     maxallocrow    = 0;
101   }
102 
103   PetscCall(PetscLayoutSetUp(B->rmap));
104   PetscCall(PetscLayoutSetUp(B->cmap));
105 
106   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
107   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
108   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
109   if (rlen) {
110     for (i = 0; i < B->rmap->n; i++) {
111       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
112       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
113     }
114   }
115 
116   B->preallocated = PETSC_TRUE;
117 
118   b = (Mat_SeqSELL *)B->data;
119 
120   if (!b->sliceheight) { /* not set yet */
121 #if defined(PETSC_HAVE_CUDA)
122     b->sliceheight = 16;
123 #else
124     b->sliceheight = 8;
125 #endif
126   }
127   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
128   b->totalslices = totalslices;
129   if (!skipallocation) {
130     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
131 
132     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
133       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
134     }
135     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
136       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
137       else if (maxallocrow < 0) maxallocrow = 1;
138 #if defined(PETSC_HAVE_CUDA)
139       rlenmax = maxallocrow;
140       /* Pad the slice to DEVICE_MEM_ALIGN */
141       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
142 #endif
143       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
144     } else {
145 #if defined(PETSC_HAVE_CUDA)
146       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
147 #endif
148       maxallocrow  = 0;
149       b->sliidx[0] = 0;
150       for (i = 1; i < totalslices; i++) {
151         b->sliidx[i] = 0;
152         for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
153 #if defined(PETSC_HAVE_CUDA)
154         rlenmax = PetscMax(b->sliidx[i], rlenmax);
155         /* Pad the slice to DEVICE_MEM_ALIGN */
156         b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
157 #endif
158         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
159         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
160       }
161       /* last slice */
162       b->sliidx[totalslices] = 0;
163       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
164 #if defined(PETSC_HAVE_CUDA)
165       rlenmax                = PetscMax(b->sliidx[i], rlenmax);
166       b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
167 #endif
168       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
169       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
170     }
171 
172     /* allocate space for val, colidx, rlen */
173     /* FIXME: should B's old memory be unlogged? */
174     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
175     /* FIXME: assuming an element of the bit array takes 8 bits */
176     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
177     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
178     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
179 
180     b->singlemalloc = PETSC_TRUE;
181     b->free_val     = PETSC_TRUE;
182     b->free_colidx  = PETSC_TRUE;
183   } else {
184     b->free_val    = PETSC_FALSE;
185     b->free_colidx = PETSC_FALSE;
186   }
187 
188   b->nz          = 0;
189   b->maxallocrow = maxallocrow;
190 #if defined(PETSC_HAVE_CUDA)
191   b->rlenmax = rlenmax;
192 #else
193   b->rlenmax = maxallocrow;
194 #endif
195   b->maxallocmat      = b->sliidx[totalslices];
196   B->info.nz_unneeded = (double)b->maxallocmat;
197   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
198   PetscFunctionReturn(PETSC_SUCCESS);
199 }
200 
201 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
202 {
203   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
204   PetscInt     shift;
205 
206   PetscFunctionBegin;
207   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
208   if (nz) *nz = a->rlen[row];
209   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
210   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
211   if (idx) {
212     PetscInt j;
213     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
214     *idx = a->getrowcols;
215   }
216   if (v) {
217     PetscInt j;
218     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
219     *v = a->getrowvals;
220   }
221   PetscFunctionReturn(PETSC_SUCCESS);
222 }
223 
224 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
225 {
226   PetscFunctionBegin;
227   PetscFunctionReturn(PETSC_SUCCESS);
228 }
229 
230 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
231 {
232   Mat          B;
233   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
234   PetscInt     i;
235 
236   PetscFunctionBegin;
237   if (reuse == MAT_REUSE_MATRIX) {
238     B = *newmat;
239     PetscCall(MatZeroEntries(B));
240   } else {
241     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
242     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
243     PetscCall(MatSetType(B, MATSEQAIJ));
244     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
245   }
246 
247   for (i = 0; i < A->rmap->n; i++) {
248     PetscInt     nz = 0, *cols = NULL;
249     PetscScalar *vals = NULL;
250 
251     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
252     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
253     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
254   }
255 
256   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
257   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
258   B->rmap->bs = A->rmap->bs;
259 
260   if (reuse == MAT_INPLACE_MATRIX) {
261     PetscCall(MatHeaderReplace(A, &B));
262   } else {
263     *newmat = B;
264   }
265   PetscFunctionReturn(PETSC_SUCCESS);
266 }
267 
268 #include <../src/mat/impls/aij/seq/aij.h>
269 
270 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
271 {
272   Mat                B;
273   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
274   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
275   const PetscInt    *cols;
276   const PetscScalar *vals;
277 
278   PetscFunctionBegin;
279   if (reuse == MAT_REUSE_MATRIX) {
280     B = *newmat;
281   } else {
282     if (PetscDefined(USE_DEBUG) || !a->ilen) {
283       PetscCall(PetscMalloc1(m, &rowlengths));
284       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
285     }
286     if (PetscDefined(USE_DEBUG) && a->ilen) {
287       PetscBool eq;
288       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
289       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
290       PetscCall(PetscFree(rowlengths));
291       rowlengths = a->ilen;
292     } else if (a->ilen) rowlengths = a->ilen;
293     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
294     PetscCall(MatSetSizes(B, m, n, m, n));
295     PetscCall(MatSetType(B, MATSEQSELL));
296     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
297     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
298   }
299 
300   for (row = 0; row < m; row++) {
301     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
302     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
303     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
304   }
305   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
306   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
307   B->rmap->bs = A->rmap->bs;
308 
309   if (reuse == MAT_INPLACE_MATRIX) {
310     PetscCall(MatHeaderReplace(A, &B));
311   } else {
312     *newmat = B;
313   }
314   PetscFunctionReturn(PETSC_SUCCESS);
315 }
316 
317 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
318 {
319   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
320   PetscScalar       *y;
321   const PetscScalar *x;
322   const MatScalar   *aval        = a->val;
323   PetscInt           totalslices = a->totalslices;
324   const PetscInt    *acolidx     = a->colidx;
325   PetscInt           i, j;
326 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
327   __m512d  vec_x, vec_y, vec_vals;
328   __m256i  vec_idx;
329   __mmask8 mask;
330   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
331   __m256i  vec_idx2, vec_idx3, vec_idx4;
332 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
333   __m128i   vec_idx;
334   __m256d   vec_x, vec_y, vec_y2, vec_vals;
335   MatScalar yval;
336   PetscInt  r, rows_left, row, nnz_in_row;
337 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
338   __m128d   vec_x_tmp;
339   __m256d   vec_x, vec_y, vec_y2, vec_vals;
340   MatScalar yval;
341   PetscInt  r, rows_left, row, nnz_in_row;
342 #else
343   PetscInt     k, sliceheight = a->sliceheight;
344   PetscScalar *sum;
345 #endif
346 
347 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
348   #pragma disjoint(*x, *y, *aval)
349 #endif
350 
351   PetscFunctionBegin;
352   PetscCall(VecGetArrayRead(xx, &x));
353   PetscCall(VecGetArray(yy, &y));
354 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
355   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
356   for (i = 0; i < totalslices; i++) { /* loop over slices */
357     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
358     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
359 
360     vec_y  = _mm512_setzero_pd();
361     vec_y2 = _mm512_setzero_pd();
362     vec_y3 = _mm512_setzero_pd();
363     vec_y4 = _mm512_setzero_pd();
364 
365     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
366     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
367     case 3:
368       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
369       acolidx += 8;
370       aval += 8;
371       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
372       acolidx += 8;
373       aval += 8;
374       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
375       acolidx += 8;
376       aval += 8;
377       j += 3;
378       break;
379     case 2:
380       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
381       acolidx += 8;
382       aval += 8;
383       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
384       acolidx += 8;
385       aval += 8;
386       j += 2;
387       break;
388     case 1:
389       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
390       acolidx += 8;
391       aval += 8;
392       j += 1;
393       break;
394     }
395   #pragma novector
396     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
397       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
398       acolidx += 8;
399       aval += 8;
400       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
401       acolidx += 8;
402       aval += 8;
403       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
404       acolidx += 8;
405       aval += 8;
406       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
407       acolidx += 8;
408       aval += 8;
409     }
410 
411     vec_y = _mm512_add_pd(vec_y, vec_y2);
412     vec_y = _mm512_add_pd(vec_y, vec_y3);
413     vec_y = _mm512_add_pd(vec_y, vec_y4);
414     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
415       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
416       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
417     } else {
418       _mm512_storeu_pd(&y[8 * i], vec_y);
419     }
420   }
421 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
422   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
423   for (i = 0; i < totalslices; i++) { /* loop over full slices */
424     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
425     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
426 
427     /* last slice may have padding rows. Don't use vectorization. */
428     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
429       rows_left = A->rmap->n - 8 * i;
430       for (r = 0; r < rows_left; ++r) {
431         yval       = (MatScalar)0;
432         row        = 8 * i + r;
433         nnz_in_row = a->rlen[row];
434         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
435         y[row] = yval;
436       }
437       break;
438     }
439 
440     vec_y  = _mm256_setzero_pd();
441     vec_y2 = _mm256_setzero_pd();
442 
443   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
444   #pragma novector
445   #pragma unroll(2)
446     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
447       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
448       aval += 4;
449       acolidx += 4;
450       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
451       aval += 4;
452       acolidx += 4;
453     }
454 
455     _mm256_storeu_pd(y + i * 8, vec_y);
456     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
457   }
458 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
459   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
460   for (i = 0; i < totalslices; i++) { /* loop over full slices */
461     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
462     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
463 
464     vec_y  = _mm256_setzero_pd();
465     vec_y2 = _mm256_setzero_pd();
466 
467     /* last slice may have padding rows. Don't use vectorization. */
468     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
469       rows_left = A->rmap->n - 8 * i;
470       for (r = 0; r < rows_left; ++r) {
471         yval       = (MatScalar)0;
472         row        = 8 * i + r;
473         nnz_in_row = a->rlen[row];
474         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
475         y[row] = yval;
476       }
477       break;
478     }
479 
480   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
481   #pragma novector
482   #pragma unroll(2)
483     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
484       vec_vals  = _mm256_loadu_pd(aval);
485       vec_x_tmp = _mm_setzero_pd();
486       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
487       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
488       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
489       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
490       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
491       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
492       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
493       aval += 4;
494 
495       vec_vals  = _mm256_loadu_pd(aval);
496       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
497       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
498       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
499       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
500       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
501       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
502       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
503       aval += 4;
504     }
505 
506     _mm256_storeu_pd(y + i * 8, vec_y);
507     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
508   }
509 #else
510   PetscCall(PetscMalloc1(sliceheight, &sum));
511   for (i = 0; i < totalslices; i++) { /* loop over slices */
512     for (j = 0; j < sliceheight; j++) {
513       sum[j] = 0.0;
514       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
515     }
516     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
517       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
518     } else {
519       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
520     }
521   }
522   PetscCall(PetscFree(sum));
523 #endif
524 
525   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
526   PetscCall(VecRestoreArrayRead(xx, &x));
527   PetscCall(VecRestoreArray(yy, &y));
528   PetscFunctionReturn(PETSC_SUCCESS);
529 }
530 
531 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
532 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
533 {
534   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
535   PetscScalar       *y, *z;
536   const PetscScalar *x;
537   const MatScalar   *aval        = a->val;
538   PetscInt           totalslices = a->totalslices;
539   const PetscInt    *acolidx     = a->colidx;
540   PetscInt           i, j;
541 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
542   __m512d  vec_x, vec_y, vec_vals;
543   __m256i  vec_idx;
544   __mmask8 mask = 0;
545   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
546   __m256i  vec_idx2, vec_idx3, vec_idx4;
547 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
548   __m128d   vec_x_tmp;
549   __m256d   vec_x, vec_y, vec_y2, vec_vals;
550   MatScalar yval;
551   PetscInt  r, row, nnz_in_row;
552 #else
553   PetscInt     k, sliceheight = a->sliceheight;
554   PetscScalar *sum;
555 #endif
556 
557 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
558   #pragma disjoint(*x, *y, *aval)
559 #endif
560 
561   PetscFunctionBegin;
562   if (!a->nz) {
563     PetscCall(VecCopy(yy, zz));
564     PetscFunctionReturn(PETSC_SUCCESS);
565   }
566   PetscCall(VecGetArrayRead(xx, &x));
567   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
568 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
569   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
570   for (i = 0; i < totalslices; i++) { /* loop over slices */
571     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
572     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
573 
574     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
575       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
576       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
577     } else {
578       vec_y = _mm512_loadu_pd(&y[8 * i]);
579     }
580     vec_y2 = _mm512_setzero_pd();
581     vec_y3 = _mm512_setzero_pd();
582     vec_y4 = _mm512_setzero_pd();
583 
584     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
585     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
586     case 3:
587       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
588       acolidx += 8;
589       aval += 8;
590       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
591       acolidx += 8;
592       aval += 8;
593       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
594       acolidx += 8;
595       aval += 8;
596       j += 3;
597       break;
598     case 2:
599       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
600       acolidx += 8;
601       aval += 8;
602       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
603       acolidx += 8;
604       aval += 8;
605       j += 2;
606       break;
607     case 1:
608       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
609       acolidx += 8;
610       aval += 8;
611       j += 1;
612       break;
613     }
614   #pragma novector
615     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
616       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
617       acolidx += 8;
618       aval += 8;
619       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
620       acolidx += 8;
621       aval += 8;
622       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
623       acolidx += 8;
624       aval += 8;
625       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
626       acolidx += 8;
627       aval += 8;
628     }
629 
630     vec_y = _mm512_add_pd(vec_y, vec_y2);
631     vec_y = _mm512_add_pd(vec_y, vec_y3);
632     vec_y = _mm512_add_pd(vec_y, vec_y4);
633     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
634       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
635     } else {
636       _mm512_storeu_pd(&z[8 * i], vec_y);
637     }
638   }
639 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
640   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
641   for (i = 0; i < totalslices; i++) { /* loop over full slices */
642     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
643     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
644 
645     /* last slice may have padding rows. Don't use vectorization. */
646     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
647       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
648         row        = 8 * i + r;
649         yval       = (MatScalar)0.0;
650         nnz_in_row = a->rlen[row];
651         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
652         z[row] = y[row] + yval;
653       }
654       break;
655     }
656 
657     vec_y  = _mm256_loadu_pd(y + 8 * i);
658     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
659 
660     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
661     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
662       vec_vals  = _mm256_loadu_pd(aval);
663       vec_x_tmp = _mm_setzero_pd();
664       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
665       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
666       vec_x     = _mm256_setzero_pd();
667       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
668       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
669       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
670       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
671       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
672       aval += 4;
673 
674       vec_vals  = _mm256_loadu_pd(aval);
675       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
676       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
677       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
678       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
679       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
680       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
681       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
682       aval += 4;
683     }
684 
685     _mm256_storeu_pd(z + i * 8, vec_y);
686     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
687   }
688 #else
689   PetscCall(PetscMalloc1(sliceheight, &sum));
690   for (i = 0; i < totalslices; i++) { /* loop over slices */
691     for (j = 0; j < sliceheight; j++) {
692       sum[j] = 0.0;
693       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
694     }
695     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
696       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
697     } else {
698       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
699     }
700   }
701   PetscCall(PetscFree(sum));
702 #endif
703 
704   PetscCall(PetscLogFlops(2.0 * a->nz));
705   PetscCall(VecRestoreArrayRead(xx, &x));
706   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
707   PetscFunctionReturn(PETSC_SUCCESS);
708 }
709 
710 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
711 {
712   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
713   PetscScalar       *y;
714   const PetscScalar *x;
715   const MatScalar   *aval    = a->val;
716   const PetscInt    *acolidx = a->colidx;
717   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
718 
719 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
720   #pragma disjoint(*x, *y, *aval)
721 #endif
722 
723   PetscFunctionBegin;
724   if (A->symmetric == PETSC_BOOL3_TRUE) {
725     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
726     PetscFunctionReturn(PETSC_SUCCESS);
727   }
728   if (zz != yy) PetscCall(VecCopy(zz, yy));
729 
730   if (a->nz) {
731     PetscCall(VecGetArrayRead(xx, &x));
732     PetscCall(VecGetArray(yy, &y));
733     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
734       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
735         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
736           row        = sliceheight * i + r;
737           nnz_in_row = a->rlen[row];
738           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
739         }
740         break;
741       }
742       for (r = 0; r < sliceheight; ++r)
743         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
744     }
745     PetscCall(PetscLogFlops(2.0 * a->nz));
746     PetscCall(VecRestoreArrayRead(xx, &x));
747     PetscCall(VecRestoreArray(yy, &y));
748   }
749   PetscFunctionReturn(PETSC_SUCCESS);
750 }
751 
752 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
753 {
754   PetscFunctionBegin;
755   if (A->symmetric == PETSC_BOOL3_TRUE) {
756     PetscCall(MatMult_SeqSELL(A, xx, yy));
757   } else {
758     PetscCall(VecSet(yy, 0.0));
759     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
760   }
761   PetscFunctionReturn(PETSC_SUCCESS);
762 }
763 
764 /*
765      Checks for missing diagonals
766 */
767 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
768 {
769   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
770   PetscInt    *diag, i;
771 
772   PetscFunctionBegin;
773   *missing = PETSC_FALSE;
774   if (A->rmap->n > 0 && !a->colidx) {
775     *missing = PETSC_TRUE;
776     if (d) *d = 0;
777     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
778   } else {
779     diag = a->diag;
780     for (i = 0; i < A->rmap->n; i++) {
781       if (diag[i] == -1) {
782         *missing = PETSC_TRUE;
783         if (d) *d = i;
784         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
785         break;
786       }
787     }
788   }
789   PetscFunctionReturn(PETSC_SUCCESS);
790 }
791 
792 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
793 {
794   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
795   PetscInt     i, j, m = A->rmap->n, shift;
796 
797   PetscFunctionBegin;
798   if (!a->diag) {
799     PetscCall(PetscMalloc1(m, &a->diag));
800     a->free_diag = PETSC_TRUE;
801   }
802   for (i = 0; i < m; i++) {                                          /* loop over rows */
803     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
804     a->diag[i] = -1;
805     for (j = 0; j < a->rlen[i]; j++) {
806       if (a->colidx[shift + a->sliceheight * j] == i) {
807         a->diag[i] = shift + a->sliceheight * j;
808         break;
809       }
810     }
811   }
812   PetscFunctionReturn(PETSC_SUCCESS);
813 }
814 
815 /*
816   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
817 */
818 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
819 {
820   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
821   PetscInt     i, *diag, m = A->rmap->n;
822   MatScalar   *val = a->val;
823   PetscScalar *idiag, *mdiag;
824 
825   PetscFunctionBegin;
826   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
827   PetscCall(MatMarkDiagonal_SeqSELL(A));
828   diag = a->diag;
829   if (!a->idiag) {
830     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
831     val = a->val;
832   }
833   mdiag = a->mdiag;
834   idiag = a->idiag;
835 
836   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
837     for (i = 0; i < m; i++) {
838       mdiag[i] = val[diag[i]];
839       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
840         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
841         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
842         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
843         A->factorerror_zeropivot_value = 0.0;
844         A->factorerror_zeropivot_row   = i;
845       }
846       idiag[i] = 1.0 / val[diag[i]];
847     }
848     PetscCall(PetscLogFlops(m));
849   } else {
850     for (i = 0; i < m; i++) {
851       mdiag[i] = val[diag[i]];
852       idiag[i] = omega / (fshift + val[diag[i]]);
853     }
854     PetscCall(PetscLogFlops(2.0 * m));
855   }
856   a->idiagvalid = PETSC_TRUE;
857   PetscFunctionReturn(PETSC_SUCCESS);
858 }
859 
860 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
861 {
862   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
863 
864   PetscFunctionBegin;
865   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
866   PetscCall(MatSeqSELLInvalidateDiagonal(A));
867   PetscFunctionReturn(PETSC_SUCCESS);
868 }
869 
870 PetscErrorCode MatDestroy_SeqSELL(Mat A)
871 {
872   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
873 
874   PetscFunctionBegin;
875   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
876   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
877   PetscCall(ISDestroy(&a->row));
878   PetscCall(ISDestroy(&a->col));
879   PetscCall(PetscFree(a->diag));
880   PetscCall(PetscFree(a->rlen));
881   PetscCall(PetscFree(a->sliidx));
882   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
883   PetscCall(PetscFree(a->solve_work));
884   PetscCall(ISDestroy(&a->icol));
885   PetscCall(PetscFree(a->saved_values));
886   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
887   PetscCall(PetscFree(A->data));
888 #if defined(PETSC_HAVE_CUDA)
889   PetscCall(PetscFree(a->chunk_slice_map));
890 #endif
891 
892   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
893   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
894   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
895   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
896   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
897   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
898   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
899 #if defined(PETSC_HAVE_CUDA)
900   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL));
901 #endif
902   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
903   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
904   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
905   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL));
906   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
907   PetscFunctionReturn(PETSC_SUCCESS);
908 }
909 
910 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
911 {
912   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
913 
914   PetscFunctionBegin;
915   switch (op) {
916   case MAT_ROW_ORIENTED:
917     a->roworiented = flg;
918     break;
919   case MAT_KEEP_NONZERO_PATTERN:
920     a->keepnonzeropattern = flg;
921     break;
922   case MAT_NEW_NONZERO_LOCATIONS:
923     a->nonew = (flg ? 0 : 1);
924     break;
925   case MAT_NEW_NONZERO_LOCATION_ERR:
926     a->nonew = (flg ? -1 : 0);
927     break;
928   case MAT_NEW_NONZERO_ALLOCATION_ERR:
929     a->nonew = (flg ? -2 : 0);
930     break;
931   case MAT_UNUSED_NONZERO_LOCATION_ERR:
932     a->nounused = (flg ? -1 : 0);
933     break;
934   case MAT_FORCE_DIAGONAL_ENTRIES:
935   case MAT_IGNORE_OFF_PROC_ENTRIES:
936   case MAT_USE_HASH_TABLE:
937   case MAT_SORTED_FULL:
938     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
939     break;
940   case MAT_SPD:
941   case MAT_SYMMETRIC:
942   case MAT_STRUCTURALLY_SYMMETRIC:
943   case MAT_HERMITIAN:
944   case MAT_SYMMETRY_ETERNAL:
945   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
946   case MAT_SPD_ETERNAL:
947     /* These options are handled directly by MatSetOption() */
948     break;
949   default:
950     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
951   }
952   PetscFunctionReturn(PETSC_SUCCESS);
953 }
954 
955 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
956 {
957   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
958   PetscInt     i, j, n, shift;
959   PetscScalar *x, zero = 0.0;
960 
961   PetscFunctionBegin;
962   PetscCall(VecGetLocalSize(v, &n));
963   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
964 
965   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
966     PetscInt *diag = a->diag;
967     PetscCall(VecGetArray(v, &x));
968     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
969     PetscCall(VecRestoreArray(v, &x));
970     PetscFunctionReturn(PETSC_SUCCESS);
971   }
972 
973   PetscCall(VecSet(v, zero));
974   PetscCall(VecGetArray(v, &x));
975   for (i = 0; i < n; i++) {                                     /* loop over rows */
976     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
977     x[i]  = 0;
978     for (j = 0; j < a->rlen[i]; j++) {
979       if (a->colidx[shift + a->sliceheight * j] == i) {
980         x[i] = a->val[shift + a->sliceheight * j];
981         break;
982       }
983     }
984   }
985   PetscCall(VecRestoreArray(v, &x));
986   PetscFunctionReturn(PETSC_SUCCESS);
987 }
988 
989 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
990 {
991   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
992   const PetscScalar *l, *r;
993   PetscInt           i, j, m, n, row;
994 
995   PetscFunctionBegin;
996   if (ll) {
997     /* The local size is used so that VecMPI can be passed to this routine
998        by MatDiagonalScale_MPISELL */
999     PetscCall(VecGetLocalSize(ll, &m));
1000     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1001     PetscCall(VecGetArrayRead(ll, &l));
1002     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1003       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1004         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1005           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1006         }
1007       } else {
1008         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1009       }
1010     }
1011     PetscCall(VecRestoreArrayRead(ll, &l));
1012     PetscCall(PetscLogFlops(a->nz));
1013   }
1014   if (rr) {
1015     PetscCall(VecGetLocalSize(rr, &n));
1016     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1017     PetscCall(VecGetArrayRead(rr, &r));
1018     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1019       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1020         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1021           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1022         }
1023       } else {
1024         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1025       }
1026     }
1027     PetscCall(VecRestoreArrayRead(rr, &r));
1028     PetscCall(PetscLogFlops(a->nz));
1029   }
1030   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1031 #if defined(PETSC_HAVE_CUDA)
1032   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1033 #endif
1034   PetscFunctionReturn(PETSC_SUCCESS);
1035 }
1036 
1037 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1038 {
1039   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1040   PetscInt    *cp, i, k, low, high, t, row, col, l;
1041   PetscInt     shift;
1042   MatScalar   *vp;
1043 
1044   PetscFunctionBegin;
1045   for (k = 0; k < m; k++) { /* loop over requested rows */
1046     row = im[k];
1047     if (row < 0) continue;
1048     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1049     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1050     cp    = a->colidx + shift;                                        /* pointer to the row */
1051     vp    = a->val + shift;                                           /* pointer to the row */
1052     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1053       col = in[l];
1054       if (col < 0) continue;
1055       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1056       high = a->rlen[row];
1057       low  = 0; /* assume unsorted */
1058       while (high - low > 5) {
1059         t = (low + high) / 2;
1060         if (*(cp + a->sliceheight * t) > col) high = t;
1061         else low = t;
1062       }
1063       for (i = low; i < high; i++) {
1064         if (*(cp + a->sliceheight * i) > col) break;
1065         if (*(cp + a->sliceheight * i) == col) {
1066           *v++ = *(vp + a->sliceheight * i);
1067           goto finished;
1068         }
1069       }
1070       *v++ = 0.0;
1071     finished:;
1072     }
1073   }
1074   PetscFunctionReturn(PETSC_SUCCESS);
1075 }
1076 
1077 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1078 {
1079   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1080   PetscInt          i, j, m = A->rmap->n, shift;
1081   const char       *name;
1082   PetscViewerFormat format;
1083 
1084   PetscFunctionBegin;
1085   PetscCall(PetscViewerGetFormat(viewer, &format));
1086   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1087     PetscInt nofinalvalue = 0;
1088     /*
1089     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1090       nofinalvalue = 1;
1091     }
1092     */
1093     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1094     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1095     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1096 #if defined(PETSC_USE_COMPLEX)
1097     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1098 #else
1099     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1100 #endif
1101     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1102 
1103     for (i = 0; i < m; i++) {
1104       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1105       for (j = 0; j < a->rlen[i]; j++) {
1106 #if defined(PETSC_USE_COMPLEX)
1107         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1108 #else
1109         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1110 #endif
1111       }
1112     }
1113     /*
1114     if (nofinalvalue) {
1115 #if defined(PETSC_USE_COMPLEX)
1116       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1117 #else
1118       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1119 #endif
1120     }
1121     */
1122     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1123     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1124     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1125   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1126     PetscFunctionReturn(PETSC_SUCCESS);
1127   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1128     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1129     for (i = 0; i < m; i++) {
1130       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1131       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1132       for (j = 0; j < a->rlen[i]; j++) {
1133 #if defined(PETSC_USE_COMPLEX)
1134         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1135           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1136         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1137           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1138         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1139           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1140         }
1141 #else
1142         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1143 #endif
1144       }
1145       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1146     }
1147     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1148   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1149     PetscInt    cnt = 0, jcnt;
1150     PetscScalar value;
1151 #if defined(PETSC_USE_COMPLEX)
1152     PetscBool realonly = PETSC_TRUE;
1153     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1154       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1155         realonly = PETSC_FALSE;
1156         break;
1157       }
1158     }
1159 #endif
1160 
1161     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1162     for (i = 0; i < m; i++) {
1163       jcnt  = 0;
1164       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1165       for (j = 0; j < A->cmap->n; j++) {
1166         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1167           value = a->val[cnt++];
1168           jcnt++;
1169         } else {
1170           value = 0.0;
1171         }
1172 #if defined(PETSC_USE_COMPLEX)
1173         if (realonly) {
1174           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1175         } else {
1176           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1177         }
1178 #else
1179         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1180 #endif
1181       }
1182       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1183     }
1184     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1185   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1186     PetscInt fshift = 1;
1187     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1188 #if defined(PETSC_USE_COMPLEX)
1189     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1190 #else
1191     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1192 #endif
1193     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1194     for (i = 0; i < m; i++) {
1195       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1196       for (j = 0; j < a->rlen[i]; j++) {
1197 #if defined(PETSC_USE_COMPLEX)
1198         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1199 #else
1200         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1201 #endif
1202       }
1203     }
1204     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1205   } else if (format == PETSC_VIEWER_NATIVE) {
1206     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1207       PetscInt row;
1208       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1209       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1210 #if defined(PETSC_USE_COMPLEX)
1211         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1212           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1213         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1214           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1215         } else {
1216           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1217         }
1218 #else
1219         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1220 #endif
1221       }
1222     }
1223   } else {
1224     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1225     if (A->factortype) {
1226       for (i = 0; i < m; i++) {
1227         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1228         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1229         /* L part */
1230         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1231 #if defined(PETSC_USE_COMPLEX)
1232           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1233             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1234           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1235             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1236           } else {
1237             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1238           }
1239 #else
1240           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1241 #endif
1242         }
1243         /* diagonal */
1244         j = a->diag[i];
1245 #if defined(PETSC_USE_COMPLEX)
1246         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1247           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1248         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1249           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1250         } else {
1251           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1252         }
1253 #else
1254         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1255 #endif
1256 
1257         /* U part */
1258         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1259 #if defined(PETSC_USE_COMPLEX)
1260           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1261             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1262           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1263             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1264           } else {
1265             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1266           }
1267 #else
1268           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1269 #endif
1270         }
1271         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1272       }
1273     } else {
1274       for (i = 0; i < m; i++) {
1275         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1276         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1277         for (j = 0; j < a->rlen[i]; j++) {
1278 #if defined(PETSC_USE_COMPLEX)
1279           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1280             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1281           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1282             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1283           } else {
1284             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1285           }
1286 #else
1287           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1288 #endif
1289         }
1290         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1291       }
1292     }
1293     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1294   }
1295   PetscCall(PetscViewerFlush(viewer));
1296   PetscFunctionReturn(PETSC_SUCCESS);
1297 }
1298 
1299 #include <petscdraw.h>
1300 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1301 {
1302   Mat               A = (Mat)Aa;
1303   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1304   PetscInt          i, j, m = A->rmap->n, shift;
1305   int               color;
1306   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1307   PetscViewer       viewer;
1308   PetscViewerFormat format;
1309 
1310   PetscFunctionBegin;
1311   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1312   PetscCall(PetscViewerGetFormat(viewer, &format));
1313   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1314 
1315   /* loop over matrix elements drawing boxes */
1316 
1317   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1318     PetscDrawCollectiveBegin(draw);
1319     /* Blue for negative, Cyan for zero and  Red for positive */
1320     color = PETSC_DRAW_BLUE;
1321     for (i = 0; i < m; i++) {
1322       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1323       y_l   = m - i - 1.0;
1324       y_r   = y_l + 1.0;
1325       for (j = 0; j < a->rlen[i]; j++) {
1326         x_l = a->colidx[shift + a->sliceheight * j];
1327         x_r = x_l + 1.0;
1328         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1329         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1330       }
1331     }
1332     color = PETSC_DRAW_CYAN;
1333     for (i = 0; i < m; i++) {
1334       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1335       y_l   = m - i - 1.0;
1336       y_r   = y_l + 1.0;
1337       for (j = 0; j < a->rlen[i]; j++) {
1338         x_l = a->colidx[shift + a->sliceheight * j];
1339         x_r = x_l + 1.0;
1340         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1341         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1342       }
1343     }
1344     color = PETSC_DRAW_RED;
1345     for (i = 0; i < m; i++) {
1346       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1347       y_l   = m - i - 1.0;
1348       y_r   = y_l + 1.0;
1349       for (j = 0; j < a->rlen[i]; j++) {
1350         x_l = a->colidx[shift + a->sliceheight * j];
1351         x_r = x_l + 1.0;
1352         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1353         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1354       }
1355     }
1356     PetscDrawCollectiveEnd(draw);
1357   } else {
1358     /* use contour shading to indicate magnitude of values */
1359     /* first determine max of all nonzero values */
1360     PetscReal minv = 0.0, maxv = 0.0;
1361     PetscInt  count = 0;
1362     PetscDraw popup;
1363     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1364       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1365     }
1366     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1367     PetscCall(PetscDrawGetPopup(draw, &popup));
1368     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1369 
1370     PetscDrawCollectiveBegin(draw);
1371     for (i = 0; i < m; i++) {
1372       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1373       y_l   = m - i - 1.0;
1374       y_r   = y_l + 1.0;
1375       for (j = 0; j < a->rlen[i]; j++) {
1376         x_l   = a->colidx[shift + a->sliceheight * j];
1377         x_r   = x_l + 1.0;
1378         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1379         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1380         count++;
1381       }
1382     }
1383     PetscDrawCollectiveEnd(draw);
1384   }
1385   PetscFunctionReturn(PETSC_SUCCESS);
1386 }
1387 
1388 #include <petscdraw.h>
1389 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1390 {
1391   PetscDraw draw;
1392   PetscReal xr, yr, xl, yl, h, w;
1393   PetscBool isnull;
1394 
1395   PetscFunctionBegin;
1396   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1397   PetscCall(PetscDrawIsNull(draw, &isnull));
1398   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1399 
1400   xr = A->cmap->n;
1401   yr = A->rmap->n;
1402   h  = yr / 10.0;
1403   w  = xr / 10.0;
1404   xr += w;
1405   yr += h;
1406   xl = -w;
1407   yl = -h;
1408   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1409   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1410   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1411   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1412   PetscCall(PetscDrawSave(draw));
1413   PetscFunctionReturn(PETSC_SUCCESS);
1414 }
1415 
1416 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1417 {
1418   PetscBool iascii, isbinary, isdraw;
1419 
1420   PetscFunctionBegin;
1421   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1422   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1423   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1424   if (iascii) {
1425     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1426   } else if (isbinary) {
1427     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1428   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1429   PetscFunctionReturn(PETSC_SUCCESS);
1430 }
1431 
1432 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1433 {
1434   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1435   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1436   MatScalar   *vp;
1437 #if defined(PETSC_HAVE_CUDA)
1438   PetscInt totalchunks = 0;
1439 #endif
1440 
1441   PetscFunctionBegin;
1442   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1443   /* To do: compress out the unused elements */
1444   PetscCall(MatMarkDiagonal_SeqSELL(A));
1445   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1446   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1447   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1448   a->nonzerorowcnt = 0;
1449   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1450   for (i = 0; i < a->totalslices; ++i) {
1451     shift = a->sliidx[i];                                                   /* starting index of the slice */
1452     cp    = PetscSafePointerPlusOffset(a->colidx, shift);                   /* pointer to the column indices of the slice */
1453     vp    = PetscSafePointerPlusOffset(a->val, shift);                      /* pointer to the nonzero values of the slice */
1454     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1455       row  = a->sliceheight * i + row_in_slice;
1456       nrow = a->rlen[row]; /* number of nonzeros in row */
1457       /*
1458         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1459         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1460       */
1461       lastcol = 0;
1462       if (nrow > 0) { /* nonempty row */
1463         a->nonzerorowcnt++;
1464         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1465       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1466         for (j = 1; j < a->sliceheight; j++) {
1467           if (a->rlen[a->sliceheight * i + j]) {
1468             lastcol = cp[j];
1469             break;
1470           }
1471         }
1472       } else {
1473         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1474       }
1475 
1476       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1477         cp[a->sliceheight * k + row_in_slice] = lastcol;
1478         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1479       }
1480     }
1481   }
1482 
1483   A->info.mallocs += a->reallocs;
1484   a->reallocs = 0;
1485 
1486   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1487 #if defined(PETSC_HAVE_CUDA)
1488   if (!a->chunksize && a->totalslices) {
1489     a->chunksize = 64;
1490     while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2;
1491     totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize;
1492   }
1493   if (totalchunks != a->totalchunks) {
1494     PetscCall(PetscFree(a->chunk_slice_map));
1495     PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map));
1496     a->totalchunks = totalchunks;
1497   }
1498   j = 0;
1499   for (i = 0; i < totalchunks; i++) {
1500     while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++;
1501     a->chunk_slice_map[i] = j;
1502   }
1503 #endif
1504   PetscFunctionReturn(PETSC_SUCCESS);
1505 }
1506 
1507 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1508 {
1509   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1510 
1511   PetscFunctionBegin;
1512   info->block_size   = 1.0;
1513   info->nz_allocated = a->maxallocmat;
1514   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1515   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1516   info->assemblies   = A->num_ass;
1517   info->mallocs      = A->info.mallocs;
1518   info->memory       = 0; /* REVIEW ME */
1519   if (A->factortype) {
1520     info->fill_ratio_given  = A->info.fill_ratio_given;
1521     info->fill_ratio_needed = A->info.fill_ratio_needed;
1522     info->factor_mallocs    = A->info.factor_mallocs;
1523   } else {
1524     info->fill_ratio_given  = 0;
1525     info->fill_ratio_needed = 0;
1526     info->factor_mallocs    = 0;
1527   }
1528   PetscFunctionReturn(PETSC_SUCCESS);
1529 }
1530 
1531 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1532 {
1533   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1534   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1535   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1536   MatScalar   *vp, value;
1537 #if defined(PETSC_HAVE_CUDA)
1538   PetscBool inserted = PETSC_FALSE;
1539   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1540 #endif
1541 
1542   PetscFunctionBegin;
1543   for (k = 0; k < m; k++) { /* loop over added rows */
1544     row = im[k];
1545     if (row < 0) continue;
1546     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1547     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1548     cp    = a->colidx + shift;                                      /* pointer to the row */
1549     vp    = a->val + shift;                                         /* pointer to the row */
1550     nrow  = a->rlen[row];
1551     low   = 0;
1552     high  = nrow;
1553 
1554     for (l = 0; l < n; l++) { /* loop over added columns */
1555       col = in[l];
1556       if (col < 0) continue;
1557       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1558       if (a->roworiented) {
1559         value = v[l + k * n];
1560       } else {
1561         value = v[k + l * m];
1562       }
1563       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1564 
1565       /* search in this row for the specified column, i indicates the column to be set */
1566       if (col <= lastcol) low = 0;
1567       else high = nrow;
1568       lastcol = col;
1569       while (high - low > 5) {
1570         t = (low + high) / 2;
1571         if (*(cp + a->sliceheight * t) > col) high = t;
1572         else low = t;
1573       }
1574       for (i = low; i < high; i++) {
1575         if (*(cp + a->sliceheight * i) > col) break;
1576         if (*(cp + a->sliceheight * i) == col) {
1577           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1578           else *(vp + a->sliceheight * i) = value;
1579 #if defined(PETSC_HAVE_CUDA)
1580           inserted = PETSC_TRUE;
1581 #endif
1582           low = i + 1;
1583           goto noinsert;
1584         }
1585       }
1586       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1587       if (nonew == 1) goto noinsert;
1588       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1589 #if defined(PETSC_HAVE_CUDA)
1590       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1591 #else
1592       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1593       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1594 #endif
1595       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1596       for (ii = nrow - 1; ii >= i; ii--) {
1597         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1598         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1599       }
1600       a->rlen[row]++;
1601       *(cp + a->sliceheight * i) = col;
1602       *(vp + a->sliceheight * i) = value;
1603       a->nz++;
1604       A->nonzerostate++;
1605 #if defined(PETSC_HAVE_CUDA)
1606       inserted = PETSC_TRUE;
1607 #endif
1608       low = i + 1;
1609       high++;
1610       nrow++;
1611     noinsert:;
1612     }
1613     a->rlen[row] = nrow;
1614   }
1615 #if defined(PETSC_HAVE_CUDA)
1616   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1617 #endif
1618   PetscFunctionReturn(PETSC_SUCCESS);
1619 }
1620 
1621 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1622 {
1623   PetscFunctionBegin;
1624   /* If the two matrices have the same copy implementation, use fast copy. */
1625   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1626     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1627     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1628 
1629     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1630     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1631   } else {
1632     PetscCall(MatCopy_Basic(A, B, str));
1633   }
1634   PetscFunctionReturn(PETSC_SUCCESS);
1635 }
1636 
1637 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1638 {
1639   PetscFunctionBegin;
1640   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1641   PetscFunctionReturn(PETSC_SUCCESS);
1642 }
1643 
1644 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1645 {
1646   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1647 
1648   PetscFunctionBegin;
1649   *array = a->val;
1650   PetscFunctionReturn(PETSC_SUCCESS);
1651 }
1652 
1653 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1654 {
1655   PetscFunctionBegin;
1656   PetscFunctionReturn(PETSC_SUCCESS);
1657 }
1658 
1659 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1660 {
1661   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1662   MatScalar   *aval   = a->val;
1663   PetscScalar  oalpha = alpha;
1664   PetscBLASInt one    = 1, size;
1665 
1666   PetscFunctionBegin;
1667   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1668   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1669   PetscCall(PetscLogFlops(a->nz));
1670   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1671 #if defined(PETSC_HAVE_CUDA)
1672   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1673 #endif
1674   PetscFunctionReturn(PETSC_SUCCESS);
1675 }
1676 
1677 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1678 {
1679   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1680 
1681   PetscFunctionBegin;
1682   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1683   PetscCall(MatShift_Basic(Y, a));
1684   PetscFunctionReturn(PETSC_SUCCESS);
1685 }
1686 
1687 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1688 {
1689   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1690   PetscScalar       *x, sum, *t;
1691   const MatScalar   *idiag = NULL, *mdiag;
1692   const PetscScalar *b, *xb;
1693   PetscInt           n, m = A->rmap->n, i, j, shift;
1694   const PetscInt    *diag;
1695 
1696   PetscFunctionBegin;
1697   its = its * lits;
1698 
1699   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1700   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1701   a->fshift = fshift;
1702   a->omega  = omega;
1703 
1704   diag  = a->diag;
1705   t     = a->ssor_work;
1706   idiag = a->idiag;
1707   mdiag = a->mdiag;
1708 
1709   PetscCall(VecGetArray(xx, &x));
1710   PetscCall(VecGetArrayRead(bb, &b));
1711   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1712   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1713   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1714   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1715 
1716   if (flag & SOR_ZERO_INITIAL_GUESS) {
1717     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1718       for (i = 0; i < m; i++) {
1719         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1720         sum   = b[i];
1721         n     = (diag[i] - shift) / a->sliceheight;
1722         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1723         t[i] = sum;
1724         x[i] = sum * idiag[i];
1725       }
1726       xb = t;
1727       PetscCall(PetscLogFlops(a->nz));
1728     } else xb = b;
1729     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1730       for (i = m - 1; i >= 0; i--) {
1731         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1732         sum   = xb[i];
1733         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1734         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1735         if (xb == b) {
1736           x[i] = sum * idiag[i];
1737         } else {
1738           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1739         }
1740       }
1741       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1742     }
1743     its--;
1744   }
1745   while (its--) {
1746     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1747       for (i = 0; i < m; i++) {
1748         /* lower */
1749         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1750         sum   = b[i];
1751         n     = (diag[i] - shift) / a->sliceheight;
1752         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1753         t[i] = sum; /* save application of the lower-triangular part */
1754         /* upper */
1755         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1756         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1757         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1758       }
1759       xb = t;
1760       PetscCall(PetscLogFlops(2.0 * a->nz));
1761     } else xb = b;
1762     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1763       for (i = m - 1; i >= 0; i--) {
1764         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1765         sum   = xb[i];
1766         if (xb == b) {
1767           /* whole matrix (no checkpointing available) */
1768           n = a->rlen[i];
1769           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1770           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1771         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1772           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1773           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1774           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1775         }
1776       }
1777       if (xb == b) {
1778         PetscCall(PetscLogFlops(2.0 * a->nz));
1779       } else {
1780         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1781       }
1782     }
1783   }
1784   PetscCall(VecRestoreArray(xx, &x));
1785   PetscCall(VecRestoreArrayRead(bb, &b));
1786   PetscFunctionReturn(PETSC_SUCCESS);
1787 }
1788 
1789 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1790                                        MatGetRow_SeqSELL,
1791                                        MatRestoreRow_SeqSELL,
1792                                        MatMult_SeqSELL,
1793                                        /* 4*/ MatMultAdd_SeqSELL,
1794                                        MatMultTranspose_SeqSELL,
1795                                        MatMultTransposeAdd_SeqSELL,
1796                                        NULL,
1797                                        NULL,
1798                                        NULL,
1799                                        /* 10*/ NULL,
1800                                        NULL,
1801                                        NULL,
1802                                        MatSOR_SeqSELL,
1803                                        NULL,
1804                                        /* 15*/ MatGetInfo_SeqSELL,
1805                                        MatEqual_SeqSELL,
1806                                        MatGetDiagonal_SeqSELL,
1807                                        MatDiagonalScale_SeqSELL,
1808                                        NULL,
1809                                        /* 20*/ NULL,
1810                                        MatAssemblyEnd_SeqSELL,
1811                                        MatSetOption_SeqSELL,
1812                                        MatZeroEntries_SeqSELL,
1813                                        /* 24*/ NULL,
1814                                        NULL,
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        /* 29*/ MatSetUp_SeqSELL,
1819                                        NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        /* 34*/ MatDuplicate_SeqSELL,
1824                                        NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        /* 39*/ NULL,
1829                                        NULL,
1830                                        NULL,
1831                                        MatGetValues_SeqSELL,
1832                                        MatCopy_SeqSELL,
1833                                        /* 44*/ NULL,
1834                                        MatScale_SeqSELL,
1835                                        MatShift_SeqSELL,
1836                                        NULL,
1837                                        NULL,
1838                                        /* 49*/ NULL,
1839                                        NULL,
1840                                        NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1844                                        NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        /* 59*/ NULL,
1849                                        MatDestroy_SeqSELL,
1850                                        MatView_SeqSELL,
1851                                        NULL,
1852                                        NULL,
1853                                        /* 64*/ NULL,
1854                                        NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        /* 69*/ NULL,
1859                                        NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        /* 74*/ NULL,
1864                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1865                                        NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        /* 79*/ NULL,
1869                                        NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        /* 84*/ NULL,
1874                                        NULL,
1875                                        NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        /* 89*/ NULL,
1879                                        NULL,
1880                                        NULL,
1881                                        NULL,
1882                                        NULL,
1883                                        /* 94*/ NULL,
1884                                        NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        /* 99*/ NULL,
1889                                        NULL,
1890                                        NULL,
1891                                        MatConjugate_SeqSELL,
1892                                        NULL,
1893                                        /*104*/ NULL,
1894                                        NULL,
1895                                        NULL,
1896                                        NULL,
1897                                        NULL,
1898                                        /*109*/ NULL,
1899                                        NULL,
1900                                        NULL,
1901                                        NULL,
1902                                        MatMissingDiagonal_SeqSELL,
1903                                        /*114*/ NULL,
1904                                        NULL,
1905                                        NULL,
1906                                        NULL,
1907                                        NULL,
1908                                        /*119*/ NULL,
1909                                        NULL,
1910                                        NULL,
1911                                        NULL,
1912                                        NULL,
1913                                        /*124*/ NULL,
1914                                        NULL,
1915                                        NULL,
1916                                        NULL,
1917                                        NULL,
1918                                        /*129*/ NULL,
1919                                        NULL,
1920                                        NULL,
1921                                        NULL,
1922                                        NULL,
1923                                        /*134*/ NULL,
1924                                        NULL,
1925                                        NULL,
1926                                        NULL,
1927                                        NULL,
1928                                        /*139*/ NULL,
1929                                        NULL,
1930                                        NULL,
1931                                        MatFDColoringSetUp_SeqXAIJ,
1932                                        NULL,
1933                                        /*144*/ NULL,
1934                                        NULL,
1935                                        NULL,
1936                                        NULL,
1937                                        NULL,
1938                                        NULL,
1939                                        /*150*/ NULL,
1940                                        NULL,
1941                                        NULL};
1942 
1943 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1944 {
1945   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1946 
1947   PetscFunctionBegin;
1948   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1949 
1950   /* allocate space for values if not already there */
1951   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1952 
1953   /* copy values over */
1954   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1955   PetscFunctionReturn(PETSC_SUCCESS);
1956 }
1957 
1958 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1959 {
1960   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1961 
1962   PetscFunctionBegin;
1963   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1964   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1965   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1966   PetscFunctionReturn(PETSC_SUCCESS);
1967 }
1968 
1969 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1970 {
1971   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1972 
1973   PetscFunctionBegin;
1974   if (a->totalslices && a->sliidx[a->totalslices]) {
1975     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1976   } else {
1977     *ratio = 0.0;
1978   }
1979   PetscFunctionReturn(PETSC_SUCCESS);
1980 }
1981 
1982 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1983 {
1984   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1985   PetscInt     i, current_slicewidth;
1986 
1987   PetscFunctionBegin;
1988   *slicewidth = 0;
1989   for (i = 0; i < a->totalslices; i++) {
1990     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
1991     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
1992   }
1993   PetscFunctionReturn(PETSC_SUCCESS);
1994 }
1995 
1996 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
1997 {
1998   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1999 
2000   PetscFunctionBegin;
2001   *slicewidth = 0;
2002   if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
2003   PetscFunctionReturn(PETSC_SUCCESS);
2004 }
2005 
2006 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance)
2007 {
2008   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2009   PetscReal    mean;
2010   PetscInt     i, totalslices = a->totalslices, *sliidx = a->sliidx;
2011 
2012   PetscFunctionBegin;
2013   *variance = 0;
2014   if (totalslices) {
2015     mean = (PetscReal)sliidx[totalslices] / totalslices;
2016     for (i = 1; i <= totalslices; i++) { *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; }
2017   }
2018   PetscFunctionReturn(PETSC_SUCCESS);
2019 }
2020 
2021 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2022 {
2023   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2024 
2025   PetscFunctionBegin;
2026   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2027   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2028   a->sliceheight = sliceheight;
2029 #if defined(PETSC_HAVE_CUDA)
2030   PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight);
2031 #endif
2032   PetscFunctionReturn(PETSC_SUCCESS);
2033 }
2034 
2035 /*@C
2036   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2037 
2038   Not Collective
2039 
2040   Input Parameter:
2041 . A - a MATSEQSELL matrix
2042 
2043   Output Parameter:
2044 . ratio - ratio of number of padded zeros to number of allocated elements
2045 
2046   Level: intermediate
2047 
2048 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2049 @*/
2050 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2051 {
2052   PetscFunctionBegin;
2053   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2054   PetscFunctionReturn(PETSC_SUCCESS);
2055 }
2056 
2057 /*@C
2058   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2059 
2060   Not Collective
2061 
2062   Input Parameter:
2063 . A - a MATSEQSELL matrix
2064 
2065   Output Parameter:
2066 . slicewidth - maximum slice width
2067 
2068   Level: intermediate
2069 
2070 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()`
2071 @*/
2072 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2073 {
2074   PetscFunctionBegin;
2075   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2076   PetscFunctionReturn(PETSC_SUCCESS);
2077 }
2078 
2079 /*@C
2080   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2081 
2082   Not Collective
2083 
2084   Input Parameter:
2085 . A - a MATSEQSELL matrix
2086 
2087   Output Parameter:
2088 . slicewidth - average slice width
2089 
2090   Level: intermediate
2091 
2092 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()`
2093 @*/
2094 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2095 {
2096   PetscFunctionBegin;
2097   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2098   PetscFunctionReturn(PETSC_SUCCESS);
2099 }
2100 
2101 /*@C
2102   MatSeqSELLSetSliceHeight - sets the slice height.
2103 
2104   Not Collective
2105 
2106   Input Parameters:
2107 + A           - a MATSEQSELL matrix
2108 - sliceheight - slice height
2109 
2110   Notes:
2111   You cannot change the slice height once it have been set.
2112 
2113   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2114 
2115   Level: intermediate
2116 
2117 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()`
2118 @*/
2119 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2120 {
2121   PetscFunctionBegin;
2122   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2123   PetscFunctionReturn(PETSC_SUCCESS);
2124 }
2125 
2126 /*@C
2127   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2128 
2129   Not Collective
2130 
2131   Input Parameter:
2132 . A - a MATSEQSELL matrix
2133 
2134   Output Parameter:
2135 . variance - variance of the slice size
2136 
2137   Level: intermediate
2138 
2139 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()`
2140 @*/
2141 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2142 {
2143   PetscFunctionBegin;
2144   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2145   PetscFunctionReturn(PETSC_SUCCESS);
2146 }
2147 
2148 #if defined(PETSC_HAVE_CUDA)
2149 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2150 #endif
2151 
2152 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2153 {
2154   Mat_SeqSELL *b;
2155   PetscMPIInt  size;
2156 
2157   PetscFunctionBegin;
2158   PetscCall(PetscCitationsRegister(citation, &cited));
2159   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2160   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2161 
2162   PetscCall(PetscNew(&b));
2163 
2164   B->data   = (void *)b;
2165   B->ops[0] = MatOps_Values;
2166 
2167   b->row                = NULL;
2168   b->col                = NULL;
2169   b->icol               = NULL;
2170   b->reallocs           = 0;
2171   b->ignorezeroentries  = PETSC_FALSE;
2172   b->roworiented        = PETSC_TRUE;
2173   b->nonew              = 0;
2174   b->diag               = NULL;
2175   b->solve_work         = NULL;
2176   B->spptr              = NULL;
2177   b->saved_values       = NULL;
2178   b->idiag              = NULL;
2179   b->mdiag              = NULL;
2180   b->ssor_work          = NULL;
2181   b->omega              = 1.0;
2182   b->fshift             = 0.0;
2183   b->idiagvalid         = PETSC_FALSE;
2184   b->keepnonzeropattern = PETSC_FALSE;
2185   b->sliceheight        = 0;
2186 
2187   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2188   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2189   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2190   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2191   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2192   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2193   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
2194 #if defined(PETSC_HAVE_CUDA)
2195   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA));
2196 #endif
2197   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2198   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2199   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2200   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL));
2201   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2202 
2203   PetscObjectOptionsBegin((PetscObject)B);
2204   {
2205     PetscInt  newsh = -1;
2206     PetscBool flg;
2207 #if defined(PETSC_HAVE_CUDA)
2208     PetscInt chunksize = 0;
2209 #endif
2210 
2211     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2212     if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2213 #if defined(PETSC_HAVE_CUDA)
2214     PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg));
2215     if (flg) {
2216       PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize);
2217       b->chunksize = chunksize;
2218     }
2219 #endif
2220   }
2221   PetscOptionsEnd();
2222   PetscFunctionReturn(PETSC_SUCCESS);
2223 }
2224 
2225 /*
2226  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2227  */
2228 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2229 {
2230   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2231   PetscInt     i, m                           = A->rmap->n;
2232   PetscInt     totalslices = a->totalslices;
2233 
2234   PetscFunctionBegin;
2235   C->factortype = A->factortype;
2236   c->row        = NULL;
2237   c->col        = NULL;
2238   c->icol       = NULL;
2239   c->reallocs   = 0;
2240   C->assembled  = PETSC_TRUE;
2241 
2242   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2243   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2244 
2245   PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen));
2246   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2247 
2248   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2249   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2250 
2251   /* allocate the matrix space */
2252   if (mallocmatspace) {
2253     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2254 
2255     c->singlemalloc = PETSC_TRUE;
2256 
2257     if (m > 0) {
2258       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2259       if (cpvalues == MAT_COPY_VALUES) {
2260         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2261       } else {
2262         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2263       }
2264     }
2265   }
2266 
2267   c->ignorezeroentries = a->ignorezeroentries;
2268   c->roworiented       = a->roworiented;
2269   c->nonew             = a->nonew;
2270   if (a->diag) {
2271     PetscCall(PetscMalloc1(m, &c->diag));
2272     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2273   } else c->diag = NULL;
2274 
2275   c->solve_work         = NULL;
2276   c->saved_values       = NULL;
2277   c->idiag              = NULL;
2278   c->ssor_work          = NULL;
2279   c->keepnonzeropattern = a->keepnonzeropattern;
2280   c->free_val           = PETSC_TRUE;
2281   c->free_colidx        = PETSC_TRUE;
2282 
2283   c->maxallocmat  = a->maxallocmat;
2284   c->maxallocrow  = a->maxallocrow;
2285   c->rlenmax      = a->rlenmax;
2286   c->nz           = a->nz;
2287   C->preallocated = PETSC_TRUE;
2288 
2289   c->nonzerorowcnt = a->nonzerorowcnt;
2290   C->nonzerostate  = A->nonzerostate;
2291 
2292   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2293   PetscFunctionReturn(PETSC_SUCCESS);
2294 }
2295 
2296 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2297 {
2298   PetscFunctionBegin;
2299   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2300   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2301   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2302   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2303   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2304   PetscFunctionReturn(PETSC_SUCCESS);
2305 }
2306 
2307 /*MC
2308    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2309    based on the sliced Ellpack format, {cite}`zhangellpack2018`
2310 
2311    Options Database Key:
2312 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2313 
2314    Level: beginner
2315 
2316 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2317 M*/
2318 
2319 /*MC
2320    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018`
2321 
2322    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2323    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2324   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2325   for communicators controlling multiple processes.  It is recommended that you call both of
2326   the above preallocation routines for simplicity.
2327 
2328    Options Database Key:
2329 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2330 
2331   Level: beginner
2332 
2333   Notes:
2334   This format is only supported for real scalars, double precision, and 32-bit indices (the defaults).
2335 
2336   It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2337   non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2338 
2339   Developer Notes:
2340   On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2341 
2342   The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2343 .vb
2344                             (2 0  3 4)
2345    Consider the matrix A =  (5 0  6 0)
2346                             (0 0  7 8)
2347                             (0 0  9 9)
2348 
2349    symbolically the Ellpack format can be written as
2350 
2351         (2 3 4 |)           (0 2 3 |)
2352    v =  (5 6 0 |)  colidx = (0 2 2 |)
2353         --------            ---------
2354         (7 8 |)             (2 3 |)
2355         (9 9 |)             (2 3 |)
2356 
2357     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2358     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2359     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2360 
2361     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2362 
2363 .ve
2364 
2365     See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance.
2366 
2367 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2368 M*/
2369 
2370 /*@C
2371   MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2372 
2373   Collective
2374 
2375   Input Parameters:
2376 + comm    - MPI communicator, set to `PETSC_COMM_SELF`
2377 . m       - number of rows
2378 . n       - number of columns
2379 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2380 - rlen    - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2381 
2382   Output Parameter:
2383 . A - the matrix
2384 
2385   Level: intermediate
2386 
2387   Notes:
2388   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2389   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2390   [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2391 
2392   Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2393   Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2394   allocation.
2395 
2396 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL`
2397  @*/
2398 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2399 {
2400   PetscFunctionBegin;
2401   PetscCall(MatCreate(comm, A));
2402   PetscCall(MatSetSizes(*A, m, n, m, n));
2403   PetscCall(MatSetType(*A, MATSEQSELL));
2404   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2405   PetscFunctionReturn(PETSC_SUCCESS);
2406 }
2407 
2408 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2409 {
2410   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2411   PetscInt     totalslices = a->totalslices;
2412 
2413   PetscFunctionBegin;
2414   /* If the  matrix dimensions are not equal,or no of nonzeros */
2415   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2416     *flg = PETSC_FALSE;
2417     PetscFunctionReturn(PETSC_SUCCESS);
2418   }
2419   /* if the a->colidx are the same */
2420   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2421   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2422   /* if a->val are the same */
2423   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2424   PetscFunctionReturn(PETSC_SUCCESS);
2425 }
2426 
2427 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2428 {
2429   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2430 
2431   PetscFunctionBegin;
2432   a->idiagvalid = PETSC_FALSE;
2433   PetscFunctionReturn(PETSC_SUCCESS);
2434 }
2435 
2436 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2437 {
2438 #if defined(PETSC_USE_COMPLEX)
2439   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2440   PetscInt     i;
2441   PetscScalar *val = a->val;
2442 
2443   PetscFunctionBegin;
2444   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2445   #if defined(PETSC_HAVE_CUDA)
2446   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2447   #endif
2448 #else
2449   PetscFunctionBegin;
2450 #endif
2451   PetscFunctionReturn(PETSC_SUCCESS);
2452 }
2453