xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 4e58db63d9a16e1e6d7a6b59b385108b2e578961)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter `nz`
48  (or the array `nnz`).
49 
50  Collective
51 
52  Input Parameters:
53 +  B - The `MATSEQSELL` matrix
54 .  rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
55 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
56 
57  Level: intermediate
58 
59  Notes:
60  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
61  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
62  allocation.
63 
64  You can call `MatGetInfo()` to get information on how effective the preallocation was;
65  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
66  You can also run with the option `-info` and look for messages with the string
67  malloc in them to see if additional memory allocation was needed.
68 
69  Developer Note:
70  Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
71  entries or columns indices.
72 
73  The maximum number of nonzeos in any row should be as accurate as possible.
74  If it is underestimated, you will get bad performance due to reallocation
75  (`MatSeqXSELLReallocateSELL()`).
76 
77  .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
78  @*/
79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
80 {
81   PetscFunctionBegin;
82   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
83   PetscValidType(B, 1);
84   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
89 {
90   Mat_SeqSELL *b;
91   PetscInt     i, j, totalslices;
92 #if defined(PETSC_HAVE_CUDA)
93   PetscInt rlenmax = 0;
94 #endif
95   PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
96 
97   PetscFunctionBegin;
98   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
99   if (maxallocrow == MAT_SKIP_ALLOCATION) {
100     skipallocation = PETSC_TRUE;
101     maxallocrow    = 0;
102   }
103 
104   PetscCall(PetscLayoutSetUp(B->rmap));
105   PetscCall(PetscLayoutSetUp(B->cmap));
106 
107   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
108   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
109   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
110   if (rlen) {
111     for (i = 0; i < B->rmap->n; i++) {
112       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
113       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
114     }
115   }
116 
117   B->preallocated = PETSC_TRUE;
118 
119   b = (Mat_SeqSELL *)B->data;
120 
121   if (!b->sliceheight) { /* not set yet */
122 #if defined(PETSC_HAVE_CUDA)
123     b->sliceheight = 16;
124 #else
125     b->sliceheight = 8;
126 #endif
127   }
128   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
129   b->totalslices = totalslices;
130   if (!skipallocation) {
131     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
132 
133     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
134       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
135     }
136     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
137       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
138       else if (maxallocrow < 0) maxallocrow = 1;
139 #if defined(PETSC_HAVE_CUDA)
140       rlenmax = maxallocrow;
141       /* Pad the slice to DEVICE_MEM_ALIGN */
142       while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++;
143 #endif
144       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
145     } else {
146 #if defined(PETSC_HAVE_CUDA)
147       PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight;
148 #endif
149       maxallocrow  = 0;
150       b->sliidx[0] = 0;
151       for (i = 1; i < totalslices; i++) {
152         b->sliidx[i] = 0;
153         for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
154 #if defined(PETSC_HAVE_CUDA)
155         rlenmax = PetscMax(b->sliidx[i], rlenmax);
156         /* Pad the slice to DEVICE_MEM_ALIGN */
157         b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul;
158 #endif
159         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
160         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
161       }
162       /* last slice */
163       b->sliidx[totalslices] = 0;
164       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
165 #if defined(PETSC_HAVE_CUDA)
166       rlenmax                = PetscMax(b->sliidx[i], rlenmax);
167       b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul;
168 #endif
169       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
170       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
171     }
172 
173     /* allocate space for val, colidx, rlen */
174     /* FIXME: should B's old memory be unlogged? */
175     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
176     /* FIXME: assuming an element of the bit array takes 8 bits */
177     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
178     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
179     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
180 
181     b->singlemalloc = PETSC_TRUE;
182     b->free_val     = PETSC_TRUE;
183     b->free_colidx  = PETSC_TRUE;
184   } else {
185     b->free_val    = PETSC_FALSE;
186     b->free_colidx = PETSC_FALSE;
187   }
188 
189   b->nz          = 0;
190   b->maxallocrow = maxallocrow;
191 #if defined(PETSC_HAVE_CUDA)
192   b->rlenmax = rlenmax;
193 #else
194   b->rlenmax = maxallocrow;
195 #endif
196   b->maxallocmat      = b->sliidx[totalslices];
197   B->info.nz_unneeded = (double)b->maxallocmat;
198   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
199   PetscFunctionReturn(PETSC_SUCCESS);
200 }
201 
202 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
203 {
204   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
205   PetscInt     shift;
206 
207   PetscFunctionBegin;
208   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
209   if (nz) *nz = a->rlen[row];
210   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
211   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
212   if (idx) {
213     PetscInt j;
214     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
215     *idx = a->getrowcols;
216   }
217   if (v) {
218     PetscInt j;
219     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
220     *v = a->getrowvals;
221   }
222   PetscFunctionReturn(PETSC_SUCCESS);
223 }
224 
225 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
226 {
227   PetscFunctionBegin;
228   PetscFunctionReturn(PETSC_SUCCESS);
229 }
230 
231 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
232 {
233   Mat          B;
234   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
235   PetscInt     i;
236 
237   PetscFunctionBegin;
238   if (reuse == MAT_REUSE_MATRIX) {
239     B = *newmat;
240     PetscCall(MatZeroEntries(B));
241   } else {
242     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
243     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
244     PetscCall(MatSetType(B, MATSEQAIJ));
245     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
246   }
247 
248   for (i = 0; i < A->rmap->n; i++) {
249     PetscInt     nz = 0, *cols = NULL;
250     PetscScalar *vals = NULL;
251 
252     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
253     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
254     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
255   }
256 
257   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
258   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
259   B->rmap->bs = A->rmap->bs;
260 
261   if (reuse == MAT_INPLACE_MATRIX) {
262     PetscCall(MatHeaderReplace(A, &B));
263   } else {
264     *newmat = B;
265   }
266   PetscFunctionReturn(PETSC_SUCCESS);
267 }
268 
269 #include <../src/mat/impls/aij/seq/aij.h>
270 
271 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
272 {
273   Mat                B;
274   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
275   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
276   const PetscInt    *cols;
277   const PetscScalar *vals;
278 
279   PetscFunctionBegin;
280 
281   if (reuse == MAT_REUSE_MATRIX) {
282     B = *newmat;
283   } else {
284     if (PetscDefined(USE_DEBUG) || !a->ilen) {
285       PetscCall(PetscMalloc1(m, &rowlengths));
286       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
287     }
288     if (PetscDefined(USE_DEBUG) && a->ilen) {
289       PetscBool eq;
290       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
291       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
292       PetscCall(PetscFree(rowlengths));
293       rowlengths = a->ilen;
294     } else if (a->ilen) rowlengths = a->ilen;
295     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
296     PetscCall(MatSetSizes(B, m, n, m, n));
297     PetscCall(MatSetType(B, MATSEQSELL));
298     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
299     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
300   }
301 
302   for (row = 0; row < m; row++) {
303     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
304     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
305     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
306   }
307   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
308   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
309   B->rmap->bs = A->rmap->bs;
310 
311   if (reuse == MAT_INPLACE_MATRIX) {
312     PetscCall(MatHeaderReplace(A, &B));
313   } else {
314     *newmat = B;
315   }
316   PetscFunctionReturn(PETSC_SUCCESS);
317 }
318 
319 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
320 {
321   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
322   PetscScalar       *y;
323   const PetscScalar *x;
324   const MatScalar   *aval        = a->val;
325   PetscInt           totalslices = a->totalslices;
326   const PetscInt    *acolidx     = a->colidx;
327   PetscInt           i, j;
328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
329   __m512d  vec_x, vec_y, vec_vals;
330   __m256i  vec_idx;
331   __mmask8 mask;
332   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
333   __m256i  vec_idx2, vec_idx3, vec_idx4;
334 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
335   __m128i   vec_idx;
336   __m256d   vec_x, vec_y, vec_y2, vec_vals;
337   MatScalar yval;
338   PetscInt  r, rows_left, row, nnz_in_row;
339 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
340   __m128d   vec_x_tmp;
341   __m256d   vec_x, vec_y, vec_y2, vec_vals;
342   MatScalar yval;
343   PetscInt  r, rows_left, row, nnz_in_row;
344 #else
345   PetscInt     k, sliceheight = a->sliceheight;
346   PetscScalar *sum;
347 #endif
348 
349 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
350   #pragma disjoint(*x, *y, *aval)
351 #endif
352 
353   PetscFunctionBegin;
354   PetscCall(VecGetArrayRead(xx, &x));
355   PetscCall(VecGetArray(yy, &y));
356 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
357   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
358   for (i = 0; i < totalslices; i++) { /* loop over slices */
359     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
360     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
361 
362     vec_y  = _mm512_setzero_pd();
363     vec_y2 = _mm512_setzero_pd();
364     vec_y3 = _mm512_setzero_pd();
365     vec_y4 = _mm512_setzero_pd();
366 
367     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
368     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
369     case 3:
370       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
371       acolidx += 8;
372       aval += 8;
373       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
374       acolidx += 8;
375       aval += 8;
376       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
377       acolidx += 8;
378       aval += 8;
379       j += 3;
380       break;
381     case 2:
382       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
383       acolidx += 8;
384       aval += 8;
385       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
386       acolidx += 8;
387       aval += 8;
388       j += 2;
389       break;
390     case 1:
391       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
392       acolidx += 8;
393       aval += 8;
394       j += 1;
395       break;
396     }
397   #pragma novector
398     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
399       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
400       acolidx += 8;
401       aval += 8;
402       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
403       acolidx += 8;
404       aval += 8;
405       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
406       acolidx += 8;
407       aval += 8;
408       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
409       acolidx += 8;
410       aval += 8;
411     }
412 
413     vec_y = _mm512_add_pd(vec_y, vec_y2);
414     vec_y = _mm512_add_pd(vec_y, vec_y3);
415     vec_y = _mm512_add_pd(vec_y, vec_y4);
416     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
417       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
418       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
419     } else {
420       _mm512_storeu_pd(&y[8 * i], vec_y);
421     }
422   }
423 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
424   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
425   for (i = 0; i < totalslices; i++) { /* loop over full slices */
426     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
427     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
428 
429     /* last slice may have padding rows. Don't use vectorization. */
430     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
431       rows_left = A->rmap->n - 8 * i;
432       for (r = 0; r < rows_left; ++r) {
433         yval       = (MatScalar)0;
434         row        = 8 * i + r;
435         nnz_in_row = a->rlen[row];
436         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
437         y[row] = yval;
438       }
439       break;
440     }
441 
442     vec_y  = _mm256_setzero_pd();
443     vec_y2 = _mm256_setzero_pd();
444 
445   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
446   #pragma novector
447   #pragma unroll(2)
448     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
449       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
450       aval += 4;
451       acolidx += 4;
452       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
453       aval += 4;
454       acolidx += 4;
455     }
456 
457     _mm256_storeu_pd(y + i * 8, vec_y);
458     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
459   }
460 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
461   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
462   for (i = 0; i < totalslices; i++) { /* loop over full slices */
463     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
464     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
465 
466     vec_y  = _mm256_setzero_pd();
467     vec_y2 = _mm256_setzero_pd();
468 
469     /* last slice may have padding rows. Don't use vectorization. */
470     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
471       rows_left = A->rmap->n - 8 * i;
472       for (r = 0; r < rows_left; ++r) {
473         yval       = (MatScalar)0;
474         row        = 8 * i + r;
475         nnz_in_row = a->rlen[row];
476         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
477         y[row] = yval;
478       }
479       break;
480     }
481 
482   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
483   #pragma novector
484   #pragma unroll(2)
485     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
486       vec_vals  = _mm256_loadu_pd(aval);
487       vec_x_tmp = _mm_setzero_pd();
488       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
489       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
490       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
491       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
492       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
493       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
494       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
495       aval += 4;
496 
497       vec_vals  = _mm256_loadu_pd(aval);
498       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
499       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
500       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
501       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
502       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
503       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
504       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
505       aval += 4;
506     }
507 
508     _mm256_storeu_pd(y + i * 8, vec_y);
509     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
510   }
511 #else
512   PetscCall(PetscMalloc1(sliceheight, &sum));
513   for (i = 0; i < totalslices; i++) { /* loop over slices */
514     for (j = 0; j < sliceheight; j++) {
515       sum[j] = 0.0;
516       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
517     }
518     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
519       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
520     } else {
521       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
522     }
523   }
524   PetscCall(PetscFree(sum));
525 #endif
526 
527   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
528   PetscCall(VecRestoreArrayRead(xx, &x));
529   PetscCall(VecRestoreArray(yy, &y));
530   PetscFunctionReturn(PETSC_SUCCESS);
531 }
532 
533 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
534 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
535 {
536   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
537   PetscScalar       *y, *z;
538   const PetscScalar *x;
539   const MatScalar   *aval        = a->val;
540   PetscInt           totalslices = a->totalslices;
541   const PetscInt    *acolidx     = a->colidx;
542   PetscInt           i, j;
543 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
544   __m512d  vec_x, vec_y, vec_vals;
545   __m256i  vec_idx;
546   __mmask8 mask;
547   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
548   __m256i  vec_idx2, vec_idx3, vec_idx4;
549 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
550   __m128d   vec_x_tmp;
551   __m256d   vec_x, vec_y, vec_y2, vec_vals;
552   MatScalar yval;
553   PetscInt  r, row, nnz_in_row;
554 #else
555   PetscInt     k, sliceheight = a->sliceheight;
556   PetscScalar *sum;
557 #endif
558 
559 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
560   #pragma disjoint(*x, *y, *aval)
561 #endif
562 
563   PetscFunctionBegin;
564   if (!a->nz) {
565     PetscCall(VecCopy(yy, zz));
566     PetscFunctionReturn(PETSC_SUCCESS);
567   }
568   PetscCall(VecGetArrayRead(xx, &x));
569   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
570 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
571   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
572   for (i = 0; i < totalslices; i++) { /* loop over slices */
573     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
574     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
575 
576     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
577       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
578       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
579     } else {
580       vec_y = _mm512_loadu_pd(&y[8 * i]);
581     }
582     vec_y2 = _mm512_setzero_pd();
583     vec_y3 = _mm512_setzero_pd();
584     vec_y4 = _mm512_setzero_pd();
585 
586     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
587     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
588     case 3:
589       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
590       acolidx += 8;
591       aval += 8;
592       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
593       acolidx += 8;
594       aval += 8;
595       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
596       acolidx += 8;
597       aval += 8;
598       j += 3;
599       break;
600     case 2:
601       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
602       acolidx += 8;
603       aval += 8;
604       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
605       acolidx += 8;
606       aval += 8;
607       j += 2;
608       break;
609     case 1:
610       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
611       acolidx += 8;
612       aval += 8;
613       j += 1;
614       break;
615     }
616   #pragma novector
617     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
618       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
619       acolidx += 8;
620       aval += 8;
621       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
622       acolidx += 8;
623       aval += 8;
624       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
625       acolidx += 8;
626       aval += 8;
627       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
628       acolidx += 8;
629       aval += 8;
630     }
631 
632     vec_y = _mm512_add_pd(vec_y, vec_y2);
633     vec_y = _mm512_add_pd(vec_y, vec_y3);
634     vec_y = _mm512_add_pd(vec_y, vec_y4);
635     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
636       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
637     } else {
638       _mm512_storeu_pd(&z[8 * i], vec_y);
639     }
640   }
641 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
642   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
643   for (i = 0; i < totalslices; i++) { /* loop over full slices */
644     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
645     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
646 
647     /* last slice may have padding rows. Don't use vectorization. */
648     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
649       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
650         row        = 8 * i + r;
651         yval       = (MatScalar)0.0;
652         nnz_in_row = a->rlen[row];
653         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
654         z[row] = y[row] + yval;
655       }
656       break;
657     }
658 
659     vec_y  = _mm256_loadu_pd(y + 8 * i);
660     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
661 
662     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
663     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
664       vec_vals  = _mm256_loadu_pd(aval);
665       vec_x_tmp = _mm_setzero_pd();
666       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
667       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
668       vec_x     = _mm256_setzero_pd();
669       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
670       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
671       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
672       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
673       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
674       aval += 4;
675 
676       vec_vals  = _mm256_loadu_pd(aval);
677       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
678       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
679       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
680       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
681       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
682       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
683       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
684       aval += 4;
685     }
686 
687     _mm256_storeu_pd(z + i * 8, vec_y);
688     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
689   }
690 #else
691   PetscCall(PetscMalloc1(sliceheight, &sum));
692   for (i = 0; i < totalslices; i++) { /* loop over slices */
693     for (j = 0; j < sliceheight; j++) {
694       sum[j] = 0.0;
695       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
696     }
697     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
698       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
699     } else {
700       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
701     }
702   }
703   PetscCall(PetscFree(sum));
704 #endif
705 
706   PetscCall(PetscLogFlops(2.0 * a->nz));
707   PetscCall(VecRestoreArrayRead(xx, &x));
708   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
709   PetscFunctionReturn(PETSC_SUCCESS);
710 }
711 
712 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
713 {
714   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
715   PetscScalar       *y;
716   const PetscScalar *x;
717   const MatScalar   *aval    = a->val;
718   const PetscInt    *acolidx = a->colidx;
719   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
720 
721 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
722   #pragma disjoint(*x, *y, *aval)
723 #endif
724 
725   PetscFunctionBegin;
726   if (A->symmetric == PETSC_BOOL3_TRUE) {
727     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
728     PetscFunctionReturn(PETSC_SUCCESS);
729   }
730   if (zz != yy) PetscCall(VecCopy(zz, yy));
731 
732   if (a->nz) {
733     PetscCall(VecGetArrayRead(xx, &x));
734     PetscCall(VecGetArray(yy, &y));
735     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
736       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
737         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
738           row        = sliceheight * i + r;
739           nnz_in_row = a->rlen[row];
740           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
741         }
742         break;
743       }
744       for (r = 0; r < sliceheight; ++r)
745         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
746     }
747     PetscCall(PetscLogFlops(2.0 * a->nz));
748     PetscCall(VecRestoreArrayRead(xx, &x));
749     PetscCall(VecRestoreArray(yy, &y));
750   }
751   PetscFunctionReturn(PETSC_SUCCESS);
752 }
753 
754 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
755 {
756   PetscFunctionBegin;
757   if (A->symmetric == PETSC_BOOL3_TRUE) {
758     PetscCall(MatMult_SeqSELL(A, xx, yy));
759   } else {
760     PetscCall(VecSet(yy, 0.0));
761     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
762   }
763   PetscFunctionReturn(PETSC_SUCCESS);
764 }
765 
766 /*
767      Checks for missing diagonals
768 */
769 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
770 {
771   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
772   PetscInt    *diag, i;
773 
774   PetscFunctionBegin;
775   *missing = PETSC_FALSE;
776   if (A->rmap->n > 0 && !(a->colidx)) {
777     *missing = PETSC_TRUE;
778     if (d) *d = 0;
779     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
780   } else {
781     diag = a->diag;
782     for (i = 0; i < A->rmap->n; i++) {
783       if (diag[i] == -1) {
784         *missing = PETSC_TRUE;
785         if (d) *d = i;
786         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
787         break;
788       }
789     }
790   }
791   PetscFunctionReturn(PETSC_SUCCESS);
792 }
793 
794 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
795 {
796   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
797   PetscInt     i, j, m = A->rmap->n, shift;
798 
799   PetscFunctionBegin;
800   if (!a->diag) {
801     PetscCall(PetscMalloc1(m, &a->diag));
802     a->free_diag = PETSC_TRUE;
803   }
804   for (i = 0; i < m; i++) {                                          /* loop over rows */
805     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
806     a->diag[i] = -1;
807     for (j = 0; j < a->rlen[i]; j++) {
808       if (a->colidx[shift + a->sliceheight * j] == i) {
809         a->diag[i] = shift + a->sliceheight * j;
810         break;
811       }
812     }
813   }
814   PetscFunctionReturn(PETSC_SUCCESS);
815 }
816 
817 /*
818   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
819 */
820 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
821 {
822   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
823   PetscInt     i, *diag, m = A->rmap->n;
824   MatScalar   *val = a->val;
825   PetscScalar *idiag, *mdiag;
826 
827   PetscFunctionBegin;
828   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
829   PetscCall(MatMarkDiagonal_SeqSELL(A));
830   diag = a->diag;
831   if (!a->idiag) {
832     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
833     val = a->val;
834   }
835   mdiag = a->mdiag;
836   idiag = a->idiag;
837 
838   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
839     for (i = 0; i < m; i++) {
840       mdiag[i] = val[diag[i]];
841       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
842         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
843         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
844         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
845         A->factorerror_zeropivot_value = 0.0;
846         A->factorerror_zeropivot_row   = i;
847       }
848       idiag[i] = 1.0 / val[diag[i]];
849     }
850     PetscCall(PetscLogFlops(m));
851   } else {
852     for (i = 0; i < m; i++) {
853       mdiag[i] = val[diag[i]];
854       idiag[i] = omega / (fshift + val[diag[i]]);
855     }
856     PetscCall(PetscLogFlops(2.0 * m));
857   }
858   a->idiagvalid = PETSC_TRUE;
859   PetscFunctionReturn(PETSC_SUCCESS);
860 }
861 
862 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
863 {
864   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
865 
866   PetscFunctionBegin;
867   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
868   PetscCall(MatSeqSELLInvalidateDiagonal(A));
869   PetscFunctionReturn(PETSC_SUCCESS);
870 }
871 
872 PetscErrorCode MatDestroy_SeqSELL(Mat A)
873 {
874   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
875 
876   PetscFunctionBegin;
877 #if defined(PETSC_USE_LOG)
878   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
879 #endif
880   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
881   PetscCall(ISDestroy(&a->row));
882   PetscCall(ISDestroy(&a->col));
883   PetscCall(PetscFree(a->diag));
884   PetscCall(PetscFree(a->rlen));
885   PetscCall(PetscFree(a->sliidx));
886   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
887   PetscCall(PetscFree(a->solve_work));
888   PetscCall(ISDestroy(&a->icol));
889   PetscCall(PetscFree(a->saved_values));
890   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
891   PetscCall(PetscFree(A->data));
892 
893   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
894   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
895   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
896   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
897   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
898   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
899   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqAIJ_C", NULL));
900 #if defined(PETSC_HAVE_CUDA)
901   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqSELLCUDA_C", NULL));
902 #endif
903   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
904   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
905   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
906   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
907   PetscFunctionReturn(PETSC_SUCCESS);
908 }
909 
910 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
911 {
912   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
913 
914   PetscFunctionBegin;
915   switch (op) {
916   case MAT_ROW_ORIENTED:
917     a->roworiented = flg;
918     break;
919   case MAT_KEEP_NONZERO_PATTERN:
920     a->keepnonzeropattern = flg;
921     break;
922   case MAT_NEW_NONZERO_LOCATIONS:
923     a->nonew = (flg ? 0 : 1);
924     break;
925   case MAT_NEW_NONZERO_LOCATION_ERR:
926     a->nonew = (flg ? -1 : 0);
927     break;
928   case MAT_NEW_NONZERO_ALLOCATION_ERR:
929     a->nonew = (flg ? -2 : 0);
930     break;
931   case MAT_UNUSED_NONZERO_LOCATION_ERR:
932     a->nounused = (flg ? -1 : 0);
933     break;
934   case MAT_FORCE_DIAGONAL_ENTRIES:
935   case MAT_IGNORE_OFF_PROC_ENTRIES:
936   case MAT_USE_HASH_TABLE:
937   case MAT_SORTED_FULL:
938     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
939     break;
940   case MAT_SPD:
941   case MAT_SYMMETRIC:
942   case MAT_STRUCTURALLY_SYMMETRIC:
943   case MAT_HERMITIAN:
944   case MAT_SYMMETRY_ETERNAL:
945   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
946   case MAT_SPD_ETERNAL:
947     /* These options are handled directly by MatSetOption() */
948     break;
949   default:
950     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
951   }
952   PetscFunctionReturn(PETSC_SUCCESS);
953 }
954 
955 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
956 {
957   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
958   PetscInt     i, j, n, shift;
959   PetscScalar *x, zero = 0.0;
960 
961   PetscFunctionBegin;
962   PetscCall(VecGetLocalSize(v, &n));
963   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
964 
965   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
966     PetscInt *diag = a->diag;
967     PetscCall(VecGetArray(v, &x));
968     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
969     PetscCall(VecRestoreArray(v, &x));
970     PetscFunctionReturn(PETSC_SUCCESS);
971   }
972 
973   PetscCall(VecSet(v, zero));
974   PetscCall(VecGetArray(v, &x));
975   for (i = 0; i < n; i++) {                                     /* loop over rows */
976     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
977     x[i]  = 0;
978     for (j = 0; j < a->rlen[i]; j++) {
979       if (a->colidx[shift + a->sliceheight * j] == i) {
980         x[i] = a->val[shift + a->sliceheight * j];
981         break;
982       }
983     }
984   }
985   PetscCall(VecRestoreArray(v, &x));
986   PetscFunctionReturn(PETSC_SUCCESS);
987 }
988 
989 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
990 {
991   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
992   const PetscScalar *l, *r;
993   PetscInt           i, j, m, n, row;
994 
995   PetscFunctionBegin;
996   if (ll) {
997     /* The local size is used so that VecMPI can be passed to this routine
998        by MatDiagonalScale_MPISELL */
999     PetscCall(VecGetLocalSize(ll, &m));
1000     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1001     PetscCall(VecGetArrayRead(ll, &l));
1002     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1003       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1004         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1005           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
1006         }
1007       } else {
1008         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
1009       }
1010     }
1011     PetscCall(VecRestoreArrayRead(ll, &l));
1012     PetscCall(PetscLogFlops(a->nz));
1013   }
1014   if (rr) {
1015     PetscCall(VecGetLocalSize(rr, &n));
1016     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1017     PetscCall(VecGetArrayRead(rr, &r));
1018     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
1019       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
1020         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
1021           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
1022         }
1023       } else {
1024         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1025       }
1026     }
1027     PetscCall(VecRestoreArrayRead(rr, &r));
1028     PetscCall(PetscLogFlops(a->nz));
1029   }
1030   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1031 #if defined(PETSC_HAVE_CUDA)
1032   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1033 #endif
1034   PetscFunctionReturn(PETSC_SUCCESS);
1035 }
1036 
1037 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1038 {
1039   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1040   PetscInt    *cp, i, k, low, high, t, row, col, l;
1041   PetscInt     shift;
1042   MatScalar   *vp;
1043 
1044   PetscFunctionBegin;
1045   for (k = 0; k < m; k++) { /* loop over requested rows */
1046     row = im[k];
1047     if (row < 0) continue;
1048     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1049     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1050     cp    = a->colidx + shift;                                        /* pointer to the row */
1051     vp    = a->val + shift;                                           /* pointer to the row */
1052     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1053       col = in[l];
1054       if (col < 0) continue;
1055       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1056       high = a->rlen[row];
1057       low  = 0; /* assume unsorted */
1058       while (high - low > 5) {
1059         t = (low + high) / 2;
1060         if (*(cp + a->sliceheight * t) > col) high = t;
1061         else low = t;
1062       }
1063       for (i = low; i < high; i++) {
1064         if (*(cp + a->sliceheight * i) > col) break;
1065         if (*(cp + a->sliceheight * i) == col) {
1066           *v++ = *(vp + a->sliceheight * i);
1067           goto finished;
1068         }
1069       }
1070       *v++ = 0.0;
1071     finished:;
1072     }
1073   }
1074   PetscFunctionReturn(PETSC_SUCCESS);
1075 }
1076 
1077 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1078 {
1079   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1080   PetscInt          i, j, m = A->rmap->n, shift;
1081   const char       *name;
1082   PetscViewerFormat format;
1083 
1084   PetscFunctionBegin;
1085   PetscCall(PetscViewerGetFormat(viewer, &format));
1086   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1087     PetscInt nofinalvalue = 0;
1088     /*
1089     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1090       nofinalvalue = 1;
1091     }
1092     */
1093     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1094     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1095     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1096 #if defined(PETSC_USE_COMPLEX)
1097     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1098 #else
1099     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1100 #endif
1101     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1102 
1103     for (i = 0; i < m; i++) {
1104       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1105       for (j = 0; j < a->rlen[i]; j++) {
1106 #if defined(PETSC_USE_COMPLEX)
1107         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1108 #else
1109         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1110 #endif
1111       }
1112     }
1113     /*
1114     if (nofinalvalue) {
1115 #if defined(PETSC_USE_COMPLEX)
1116       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1117 #else
1118       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1119 #endif
1120     }
1121     */
1122     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1123     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1124     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1125   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1126     PetscFunctionReturn(PETSC_SUCCESS);
1127   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1128     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1129     for (i = 0; i < m; i++) {
1130       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1131       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1132       for (j = 0; j < a->rlen[i]; j++) {
1133 #if defined(PETSC_USE_COMPLEX)
1134         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1135           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1136         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1137           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1138         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1139           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1140         }
1141 #else
1142         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1143 #endif
1144       }
1145       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1146     }
1147     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1148   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1149     PetscInt    cnt = 0, jcnt;
1150     PetscScalar value;
1151 #if defined(PETSC_USE_COMPLEX)
1152     PetscBool realonly = PETSC_TRUE;
1153     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1154       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1155         realonly = PETSC_FALSE;
1156         break;
1157       }
1158     }
1159 #endif
1160 
1161     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1162     for (i = 0; i < m; i++) {
1163       jcnt  = 0;
1164       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1165       for (j = 0; j < A->cmap->n; j++) {
1166         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1167           value = a->val[cnt++];
1168           jcnt++;
1169         } else {
1170           value = 0.0;
1171         }
1172 #if defined(PETSC_USE_COMPLEX)
1173         if (realonly) {
1174           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1175         } else {
1176           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1177         }
1178 #else
1179         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1180 #endif
1181       }
1182       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1183     }
1184     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1185   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1186     PetscInt fshift = 1;
1187     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1188 #if defined(PETSC_USE_COMPLEX)
1189     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1190 #else
1191     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1192 #endif
1193     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1194     for (i = 0; i < m; i++) {
1195       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1196       for (j = 0; j < a->rlen[i]; j++) {
1197 #if defined(PETSC_USE_COMPLEX)
1198         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1199 #else
1200         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1201 #endif
1202       }
1203     }
1204     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1205   } else if (format == PETSC_VIEWER_NATIVE) {
1206     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1207       PetscInt row;
1208       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1209       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1210 #if defined(PETSC_USE_COMPLEX)
1211         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1212           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1213         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1214           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1215         } else {
1216           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1217         }
1218 #else
1219         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1220 #endif
1221       }
1222     }
1223   } else {
1224     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1225     if (A->factortype) {
1226       for (i = 0; i < m; i++) {
1227         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1228         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1229         /* L part */
1230         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1231 #if defined(PETSC_USE_COMPLEX)
1232           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1233             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1234           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1235             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1236           } else {
1237             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1238           }
1239 #else
1240           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1241 #endif
1242         }
1243         /* diagonal */
1244         j = a->diag[i];
1245 #if defined(PETSC_USE_COMPLEX)
1246         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1247           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1248         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1249           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1250         } else {
1251           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1252         }
1253 #else
1254         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1255 #endif
1256 
1257         /* U part */
1258         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1259 #if defined(PETSC_USE_COMPLEX)
1260           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1261             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1262           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1263             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1264           } else {
1265             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1266           }
1267 #else
1268           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1269 #endif
1270         }
1271         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1272       }
1273     } else {
1274       for (i = 0; i < m; i++) {
1275         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1276         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1277         for (j = 0; j < a->rlen[i]; j++) {
1278 #if defined(PETSC_USE_COMPLEX)
1279           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1280             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1281           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1282             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1283           } else {
1284             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1285           }
1286 #else
1287           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1288 #endif
1289         }
1290         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1291       }
1292     }
1293     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1294   }
1295   PetscCall(PetscViewerFlush(viewer));
1296   PetscFunctionReturn(PETSC_SUCCESS);
1297 }
1298 
1299 #include <petscdraw.h>
1300 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1301 {
1302   Mat               A = (Mat)Aa;
1303   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1304   PetscInt          i, j, m = A->rmap->n, shift;
1305   int               color;
1306   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1307   PetscViewer       viewer;
1308   PetscViewerFormat format;
1309 
1310   PetscFunctionBegin;
1311   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1312   PetscCall(PetscViewerGetFormat(viewer, &format));
1313   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1314 
1315   /* loop over matrix elements drawing boxes */
1316 
1317   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1318     PetscDrawCollectiveBegin(draw);
1319     /* Blue for negative, Cyan for zero and  Red for positive */
1320     color = PETSC_DRAW_BLUE;
1321     for (i = 0; i < m; i++) {
1322       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1323       y_l   = m - i - 1.0;
1324       y_r   = y_l + 1.0;
1325       for (j = 0; j < a->rlen[i]; j++) {
1326         x_l = a->colidx[shift + a->sliceheight * j];
1327         x_r = x_l + 1.0;
1328         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1329         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1330       }
1331     }
1332     color = PETSC_DRAW_CYAN;
1333     for (i = 0; i < m; i++) {
1334       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1335       y_l   = m - i - 1.0;
1336       y_r   = y_l + 1.0;
1337       for (j = 0; j < a->rlen[i]; j++) {
1338         x_l = a->colidx[shift + a->sliceheight * j];
1339         x_r = x_l + 1.0;
1340         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1341         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1342       }
1343     }
1344     color = PETSC_DRAW_RED;
1345     for (i = 0; i < m; i++) {
1346       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1347       y_l   = m - i - 1.0;
1348       y_r   = y_l + 1.0;
1349       for (j = 0; j < a->rlen[i]; j++) {
1350         x_l = a->colidx[shift + a->sliceheight * j];
1351         x_r = x_l + 1.0;
1352         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1353         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1354       }
1355     }
1356     PetscDrawCollectiveEnd(draw);
1357   } else {
1358     /* use contour shading to indicate magnitude of values */
1359     /* first determine max of all nonzero values */
1360     PetscReal minv = 0.0, maxv = 0.0;
1361     PetscInt  count = 0;
1362     PetscDraw popup;
1363     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1364       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1365     }
1366     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1367     PetscCall(PetscDrawGetPopup(draw, &popup));
1368     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1369 
1370     PetscDrawCollectiveBegin(draw);
1371     for (i = 0; i < m; i++) {
1372       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1373       y_l   = m - i - 1.0;
1374       y_r   = y_l + 1.0;
1375       for (j = 0; j < a->rlen[i]; j++) {
1376         x_l   = a->colidx[shift + a->sliceheight * j];
1377         x_r   = x_l + 1.0;
1378         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1379         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1380         count++;
1381       }
1382     }
1383     PetscDrawCollectiveEnd(draw);
1384   }
1385   PetscFunctionReturn(PETSC_SUCCESS);
1386 }
1387 
1388 #include <petscdraw.h>
1389 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1390 {
1391   PetscDraw draw;
1392   PetscReal xr, yr, xl, yl, h, w;
1393   PetscBool isnull;
1394 
1395   PetscFunctionBegin;
1396   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1397   PetscCall(PetscDrawIsNull(draw, &isnull));
1398   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1399 
1400   xr = A->cmap->n;
1401   yr = A->rmap->n;
1402   h  = yr / 10.0;
1403   w  = xr / 10.0;
1404   xr += w;
1405   yr += h;
1406   xl = -w;
1407   yl = -h;
1408   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1409   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1410   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1411   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1412   PetscCall(PetscDrawSave(draw));
1413   PetscFunctionReturn(PETSC_SUCCESS);
1414 }
1415 
1416 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1417 {
1418   PetscBool iascii, isbinary, isdraw;
1419 
1420   PetscFunctionBegin;
1421   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1422   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1423   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1424   if (iascii) {
1425     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1426   } else if (isbinary) {
1427     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1428   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1429   PetscFunctionReturn(PETSC_SUCCESS);
1430 }
1431 
1432 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1433 {
1434   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1435   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1436   MatScalar   *vp;
1437 
1438   PetscFunctionBegin;
1439   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1440   /* To do: compress out the unused elements */
1441   PetscCall(MatMarkDiagonal_SeqSELL(A));
1442   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1443   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1444   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1445   a->nonzerorowcnt = 0;
1446   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1447   for (i = 0; i < a->totalslices; ++i) {
1448     shift = a->sliidx[i];                                                   /* starting index of the slice */
1449     cp    = a->colidx + shift;                                              /* pointer to the column indices of the slice */
1450     vp    = a->val + shift;                                                 /* pointer to the nonzero values of the slice */
1451     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1452       row  = a->sliceheight * i + row_in_slice;
1453       nrow = a->rlen[row]; /* number of nonzeros in row */
1454       /*
1455         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1456         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1457       */
1458       lastcol = 0;
1459       if (nrow > 0) { /* nonempty row */
1460         a->nonzerorowcnt++;
1461         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1462       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1463         for (j = 1; j < a->sliceheight; j++) {
1464           if (a->rlen[a->sliceheight * i + j]) {
1465             lastcol = cp[j];
1466             break;
1467           }
1468         }
1469       } else {
1470         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1471       }
1472 
1473       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1474         cp[a->sliceheight * k + row_in_slice] = lastcol;
1475         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1476       }
1477     }
1478   }
1479 
1480   A->info.mallocs += a->reallocs;
1481   a->reallocs = 0;
1482 
1483   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1484   PetscFunctionReturn(PETSC_SUCCESS);
1485 }
1486 
1487 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1488 {
1489   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1490 
1491   PetscFunctionBegin;
1492   info->block_size   = 1.0;
1493   info->nz_allocated = a->maxallocmat;
1494   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1495   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1496   info->assemblies   = A->num_ass;
1497   info->mallocs      = A->info.mallocs;
1498   info->memory       = 0; /* REVIEW ME */
1499   if (A->factortype) {
1500     info->fill_ratio_given  = A->info.fill_ratio_given;
1501     info->fill_ratio_needed = A->info.fill_ratio_needed;
1502     info->factor_mallocs    = A->info.factor_mallocs;
1503   } else {
1504     info->fill_ratio_given  = 0;
1505     info->fill_ratio_needed = 0;
1506     info->factor_mallocs    = 0;
1507   }
1508   PetscFunctionReturn(PETSC_SUCCESS);
1509 }
1510 
1511 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1512 {
1513   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1514   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1515   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1516   MatScalar   *vp, value;
1517 #if defined(PETSC_HAVE_CUDA)
1518   PetscBool inserted = PETSC_FALSE;
1519   PetscInt  mul      = DEVICE_MEM_ALIGN / a->sliceheight;
1520 #endif
1521 
1522   PetscFunctionBegin;
1523   for (k = 0; k < m; k++) { /* loop over added rows */
1524     row = im[k];
1525     if (row < 0) continue;
1526     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1527     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1528     cp    = a->colidx + shift;                                      /* pointer to the row */
1529     vp    = a->val + shift;                                         /* pointer to the row */
1530     nrow  = a->rlen[row];
1531     low   = 0;
1532     high  = nrow;
1533 
1534     for (l = 0; l < n; l++) { /* loop over added columns */
1535       col = in[l];
1536       if (col < 0) continue;
1537       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1538       if (a->roworiented) {
1539         value = v[l + k * n];
1540       } else {
1541         value = v[k + l * m];
1542       }
1543       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1544 
1545       /* search in this row for the specified column, i indicates the column to be set */
1546       if (col <= lastcol) low = 0;
1547       else high = nrow;
1548       lastcol = col;
1549       while (high - low > 5) {
1550         t = (low + high) / 2;
1551         if (*(cp + a->sliceheight * t) > col) high = t;
1552         else low = t;
1553       }
1554       for (i = low; i < high; i++) {
1555         if (*(cp + a->sliceheight * i) > col) break;
1556         if (*(cp + a->sliceheight * i) == col) {
1557           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1558           else *(vp + a->sliceheight * i) = value;
1559 #if defined(PETSC_HAVE_CUDA)
1560           inserted = PETSC_TRUE;
1561 #endif
1562           low = i + 1;
1563           goto noinsert;
1564         }
1565       }
1566       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1567       if (nonew == 1) goto noinsert;
1568       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1569 #if defined(PETSC_HAVE_CUDA)
1570       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul);
1571 #else
1572       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1573       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1);
1574 #endif
1575       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1576       for (ii = nrow - 1; ii >= i; ii--) {
1577         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1578         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1579       }
1580       a->rlen[row]++;
1581       *(cp + a->sliceheight * i) = col;
1582       *(vp + a->sliceheight * i) = value;
1583       a->nz++;
1584       A->nonzerostate++;
1585 #if defined(PETSC_HAVE_CUDA)
1586       inserted = PETSC_TRUE;
1587 #endif
1588       low = i + 1;
1589       high++;
1590       nrow++;
1591     noinsert:;
1592     }
1593     a->rlen[row] = nrow;
1594   }
1595 #if defined(PETSC_HAVE_CUDA)
1596   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1597 #endif
1598   PetscFunctionReturn(PETSC_SUCCESS);
1599 }
1600 
1601 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1602 {
1603   PetscFunctionBegin;
1604   /* If the two matrices have the same copy implementation, use fast copy. */
1605   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1606     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1607     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1608 
1609     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1610     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1611   } else {
1612     PetscCall(MatCopy_Basic(A, B, str));
1613   }
1614   PetscFunctionReturn(PETSC_SUCCESS);
1615 }
1616 
1617 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1618 {
1619   PetscFunctionBegin;
1620   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1621   PetscFunctionReturn(PETSC_SUCCESS);
1622 }
1623 
1624 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1625 {
1626   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1627 
1628   PetscFunctionBegin;
1629   *array = a->val;
1630   PetscFunctionReturn(PETSC_SUCCESS);
1631 }
1632 
1633 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1634 {
1635   PetscFunctionBegin;
1636   PetscFunctionReturn(PETSC_SUCCESS);
1637 }
1638 
1639 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1640 {
1641   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1642   PetscInt     i;
1643   MatScalar   *aval = a->val;
1644 
1645   PetscFunctionBegin;
1646   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1647 #if defined(PETSC_HAVE_CUDA)
1648   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1649 #endif
1650   PetscFunctionReturn(PETSC_SUCCESS);
1651 }
1652 
1653 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1654 {
1655   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1656   PetscInt     i;
1657   MatScalar   *aval = a->val;
1658 
1659   PetscFunctionBegin;
1660   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1661   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1662 #if defined(PETSC_HAVE_CUDA)
1663   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1664 #endif
1665   PetscFunctionReturn(PETSC_SUCCESS);
1666 }
1667 
1668 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1669 {
1670   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1671   MatScalar   *aval   = a->val;
1672   PetscScalar  oalpha = alpha;
1673   PetscBLASInt one    = 1, size;
1674 
1675   PetscFunctionBegin;
1676   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1677   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1678   PetscCall(PetscLogFlops(a->nz));
1679   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1680 #if defined(PETSC_HAVE_CUDA)
1681   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1682 #endif
1683   PetscFunctionReturn(PETSC_SUCCESS);
1684 }
1685 
1686 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1687 {
1688   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1689 
1690   PetscFunctionBegin;
1691   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1692   PetscCall(MatShift_Basic(Y, a));
1693   PetscFunctionReturn(PETSC_SUCCESS);
1694 }
1695 
1696 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1697 {
1698   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1699   PetscScalar       *x, sum, *t;
1700   const MatScalar   *idiag = NULL, *mdiag;
1701   const PetscScalar *b, *xb;
1702   PetscInt           n, m = A->rmap->n, i, j, shift;
1703   const PetscInt    *diag;
1704 
1705   PetscFunctionBegin;
1706   its = its * lits;
1707 
1708   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1709   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1710   a->fshift = fshift;
1711   a->omega  = omega;
1712 
1713   diag  = a->diag;
1714   t     = a->ssor_work;
1715   idiag = a->idiag;
1716   mdiag = a->mdiag;
1717 
1718   PetscCall(VecGetArray(xx, &x));
1719   PetscCall(VecGetArrayRead(bb, &b));
1720   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1721   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1722   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1723   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1724 
1725   if (flag & SOR_ZERO_INITIAL_GUESS) {
1726     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1727       for (i = 0; i < m; i++) {
1728         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1729         sum   = b[i];
1730         n     = (diag[i] - shift) / a->sliceheight;
1731         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1732         t[i] = sum;
1733         x[i] = sum * idiag[i];
1734       }
1735       xb = t;
1736       PetscCall(PetscLogFlops(a->nz));
1737     } else xb = b;
1738     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1739       for (i = m - 1; i >= 0; i--) {
1740         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1741         sum   = xb[i];
1742         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1743         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1744         if (xb == b) {
1745           x[i] = sum * idiag[i];
1746         } else {
1747           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1748         }
1749       }
1750       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1751     }
1752     its--;
1753   }
1754   while (its--) {
1755     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1756       for (i = 0; i < m; i++) {
1757         /* lower */
1758         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1759         sum   = b[i];
1760         n     = (diag[i] - shift) / a->sliceheight;
1761         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1762         t[i] = sum; /* save application of the lower-triangular part */
1763         /* upper */
1764         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1765         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1766         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1767       }
1768       xb = t;
1769       PetscCall(PetscLogFlops(2.0 * a->nz));
1770     } else xb = b;
1771     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1772       for (i = m - 1; i >= 0; i--) {
1773         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1774         sum   = xb[i];
1775         if (xb == b) {
1776           /* whole matrix (no checkpointing available) */
1777           n = a->rlen[i];
1778           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1779           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1780         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1781           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1782           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1783           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1784         }
1785       }
1786       if (xb == b) {
1787         PetscCall(PetscLogFlops(2.0 * a->nz));
1788       } else {
1789         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1790       }
1791     }
1792   }
1793   PetscCall(VecRestoreArray(xx, &x));
1794   PetscCall(VecRestoreArrayRead(bb, &b));
1795   PetscFunctionReturn(PETSC_SUCCESS);
1796 }
1797 
1798 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1799                                        MatGetRow_SeqSELL,
1800                                        MatRestoreRow_SeqSELL,
1801                                        MatMult_SeqSELL,
1802                                        /* 4*/ MatMultAdd_SeqSELL,
1803                                        MatMultTranspose_SeqSELL,
1804                                        MatMultTransposeAdd_SeqSELL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        /* 10*/ NULL,
1809                                        NULL,
1810                                        NULL,
1811                                        MatSOR_SeqSELL,
1812                                        NULL,
1813                                        /* 15*/ MatGetInfo_SeqSELL,
1814                                        MatEqual_SeqSELL,
1815                                        MatGetDiagonal_SeqSELL,
1816                                        MatDiagonalScale_SeqSELL,
1817                                        NULL,
1818                                        /* 20*/ NULL,
1819                                        MatAssemblyEnd_SeqSELL,
1820                                        MatSetOption_SeqSELL,
1821                                        MatZeroEntries_SeqSELL,
1822                                        /* 24*/ NULL,
1823                                        NULL,
1824                                        NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        /* 29*/ MatSetUp_SeqSELL,
1828                                        NULL,
1829                                        NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        /* 34*/ MatDuplicate_SeqSELL,
1833                                        NULL,
1834                                        NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        /* 39*/ NULL,
1838                                        NULL,
1839                                        NULL,
1840                                        MatGetValues_SeqSELL,
1841                                        MatCopy_SeqSELL,
1842                                        /* 44*/ NULL,
1843                                        MatScale_SeqSELL,
1844                                        MatShift_SeqSELL,
1845                                        NULL,
1846                                        NULL,
1847                                        /* 49*/ NULL,
1848                                        NULL,
1849                                        NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1853                                        NULL,
1854                                        NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        /* 59*/ NULL,
1858                                        MatDestroy_SeqSELL,
1859                                        MatView_SeqSELL,
1860                                        NULL,
1861                                        NULL,
1862                                        /* 64*/ NULL,
1863                                        NULL,
1864                                        NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        /* 69*/ NULL,
1868                                        NULL,
1869                                        NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        /* 74*/ NULL,
1873                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1874                                        NULL,
1875                                        NULL,
1876                                        NULL,
1877                                        /* 79*/ NULL,
1878                                        NULL,
1879                                        NULL,
1880                                        NULL,
1881                                        NULL,
1882                                        /* 84*/ NULL,
1883                                        NULL,
1884                                        NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        /* 89*/ NULL,
1888                                        NULL,
1889                                        NULL,
1890                                        NULL,
1891                                        NULL,
1892                                        /* 94*/ NULL,
1893                                        NULL,
1894                                        NULL,
1895                                        NULL,
1896                                        NULL,
1897                                        /* 99*/ NULL,
1898                                        NULL,
1899                                        NULL,
1900                                        MatConjugate_SeqSELL,
1901                                        NULL,
1902                                        /*104*/ NULL,
1903                                        NULL,
1904                                        NULL,
1905                                        NULL,
1906                                        NULL,
1907                                        /*109*/ NULL,
1908                                        NULL,
1909                                        NULL,
1910                                        NULL,
1911                                        MatMissingDiagonal_SeqSELL,
1912                                        /*114*/ NULL,
1913                                        NULL,
1914                                        NULL,
1915                                        NULL,
1916                                        NULL,
1917                                        /*119*/ NULL,
1918                                        NULL,
1919                                        NULL,
1920                                        NULL,
1921                                        NULL,
1922                                        /*124*/ NULL,
1923                                        NULL,
1924                                        NULL,
1925                                        NULL,
1926                                        NULL,
1927                                        /*129*/ NULL,
1928                                        NULL,
1929                                        NULL,
1930                                        NULL,
1931                                        NULL,
1932                                        /*134*/ NULL,
1933                                        NULL,
1934                                        NULL,
1935                                        NULL,
1936                                        NULL,
1937                                        /*139*/ NULL,
1938                                        NULL,
1939                                        NULL,
1940                                        MatFDColoringSetUp_SeqXAIJ,
1941                                        NULL,
1942                                        /*144*/ NULL,
1943                                        NULL,
1944                                        NULL,
1945                                        NULL,
1946                                        NULL,
1947                                        NULL,
1948                                        /*150*/ NULL,
1949                                        NULL};
1950 
1951 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1952 {
1953   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1954 
1955   PetscFunctionBegin;
1956   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1957 
1958   /* allocate space for values if not already there */
1959   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1960 
1961   /* copy values over */
1962   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1963   PetscFunctionReturn(PETSC_SUCCESS);
1964 }
1965 
1966 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1967 {
1968   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1969 
1970   PetscFunctionBegin;
1971   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1972   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1973   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1974   PetscFunctionReturn(PETSC_SUCCESS);
1975 }
1976 
1977 PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1978 {
1979   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1980 
1981   PetscFunctionBegin;
1982   if (a->totalslices && a->sliidx[a->totalslices]) {
1983     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1984   } else {
1985     *ratio = 0.0;
1986   }
1987   PetscFunctionReturn(PETSC_SUCCESS);
1988 }
1989 
1990 PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1991 {
1992   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1993   PetscInt     i, current_slicewidth;
1994 
1995   PetscFunctionBegin;
1996   *slicewidth = 0;
1997   for (i = 0; i < a->totalslices; i++) {
1998     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
1999     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
2000   }
2001   PetscFunctionReturn(PETSC_SUCCESS);
2002 }
2003 
2004 PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
2005 {
2006   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
2007 
2008   PetscFunctionBegin;
2009   *slicewidth = 0;
2010   if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
2011   PetscFunctionReturn(PETSC_SUCCESS);
2012 }
2013 
2014 PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
2015 {
2016   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2017 
2018   PetscFunctionBegin;
2019   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
2020   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
2021   a->sliceheight = sliceheight;
2022 #if defined(PETSC_HAVE_CUDA)
2023   PetscCheck(DEVICE_MEM_ALIGN % sliceheight == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "DEVICE_MEM_ALIGN is not divisible by the slice height %" PetscInt_FMT, sliceheight);
2024 #endif
2025   PetscFunctionReturn(PETSC_SUCCESS);
2026 }
2027 
2028 /*@C
2029   MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
2030 
2031   Not Collective
2032 
2033   Input Parameters:
2034 +  A - a `MATSEQSELL` matrix
2035 -  array - pointer to the data
2036 
2037   Level: intermediate
2038 
2039 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
2040 @*/
2041 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
2042 {
2043   PetscFunctionBegin;
2044   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
2045   PetscFunctionReturn(PETSC_SUCCESS);
2046 }
2047 
2048 /*@C
2049   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2050 
2051   Not Collective
2052 
2053   Input Parameter:
2054 .  A - a MATSEQSELL matrix
2055 
2056   Output Parameter:
2057 .  ratio - ratio of number of padded zeros to number of allocated elements
2058 
2059   Level: intermediate
2060 @*/
2061 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2062 {
2063   PetscFunctionBegin;
2064   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2065   PetscFunctionReturn(PETSC_SUCCESS);
2066 }
2067 
2068 /*@C
2069   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2070 
2071   Not Collective
2072 
2073   Input Parameter:
2074 .  A - a MATSEQSELL matrix
2075 
2076   Output Parameter:
2077 .  slicewidth - maximum slice width
2078 
2079  Level: intermediate
2080 @*/
2081 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2082 {
2083   PetscFunctionBegin;
2084   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2085   PetscFunctionReturn(PETSC_SUCCESS);
2086 }
2087 
2088 /*@C
2089   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2090 
2091   Not Collective
2092 
2093   Input Parameter:
2094 .  A - a MATSEQSELL matrix
2095 
2096   Output Parameter:
2097 .  slicewidth - average slice width
2098 
2099   Level: intermediate
2100 @*/
2101 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2102 {
2103   PetscFunctionBegin;
2104   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2105   PetscFunctionReturn(PETSC_SUCCESS);
2106 }
2107 
2108 /*@C
2109   MatSeqSELLSetSliceHeight - sets the slice height.
2110 
2111   Not Collective
2112 
2113   Input Parameters:
2114 +  A - a MATSEQSELL matrix
2115 -  sliceheight - slice height
2116 
2117   Notes:
2118   You cannot change the slice height once it have been set.
2119 
2120   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2121 
2122   Level: intermediate
2123 @*/
2124 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2125 {
2126   PetscFunctionBegin;
2127   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2128   PetscFunctionReturn(PETSC_SUCCESS);
2129 }
2130 
2131 /*@C
2132   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2133 
2134   Not Collective
2135 
2136   Input Parameter:
2137 .  A - a MATSEQSELL matrix
2138 
2139   Output Parameter:
2140 .  variance - variance of the slice size
2141 
2142   Level: intermediate
2143 @*/
2144 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2145 {
2146   PetscFunctionBegin;
2147   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2148   PetscFunctionReturn(PETSC_SUCCESS);
2149 }
2150 
2151 #if defined(PETSC_HAVE_CUDA)
2152 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2153 #endif
2154 
2155 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2156 {
2157   Mat_SeqSELL *b;
2158   PetscMPIInt  size;
2159 
2160   PetscFunctionBegin;
2161   PetscCall(PetscCitationsRegister(citation, &cited));
2162   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2163   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2164 
2165   PetscCall(PetscNew(&b));
2166 
2167   B->data = (void *)b;
2168 
2169   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
2170 
2171   b->row                = NULL;
2172   b->col                = NULL;
2173   b->icol               = NULL;
2174   b->reallocs           = 0;
2175   b->ignorezeroentries  = PETSC_FALSE;
2176   b->roworiented        = PETSC_TRUE;
2177   b->nonew              = 0;
2178   b->diag               = NULL;
2179   b->solve_work         = NULL;
2180   B->spptr              = NULL;
2181   b->saved_values       = NULL;
2182   b->idiag              = NULL;
2183   b->mdiag              = NULL;
2184   b->ssor_work          = NULL;
2185   b->omega              = 1.0;
2186   b->fshift             = 0.0;
2187   b->idiagvalid         = PETSC_FALSE;
2188   b->keepnonzeropattern = PETSC_FALSE;
2189   b->sliceheight        = 0;
2190 
2191   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2192   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2193   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2194   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2195   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2196   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2197   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqAIJ_C", MatConvert_SeqSELL_SeqAIJ));
2198 #if defined(PETSC_HAVE_CUDA)
2199   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqSELLCUDA_C", MatConvert_SeqSELL_SeqSELLCUDA));
2200 #endif
2201   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2202   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2203   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2204   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2205 
2206   PetscObjectOptionsBegin((PetscObject)B);
2207   {
2208     PetscInt  newsh = -1;
2209     PetscBool flg;
2210 
2211     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2212     if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2213   }
2214   PetscOptionsEnd();
2215   PetscFunctionReturn(PETSC_SUCCESS);
2216 }
2217 
2218 /*
2219  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2220  */
2221 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2222 {
2223   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2224   PetscInt     i, m                           = A->rmap->n;
2225   PetscInt     totalslices = a->totalslices;
2226 
2227   PetscFunctionBegin;
2228   C->factortype = A->factortype;
2229   c->row        = NULL;
2230   c->col        = NULL;
2231   c->icol       = NULL;
2232   c->reallocs   = 0;
2233   C->assembled  = PETSC_TRUE;
2234 
2235   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2236   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2237 
2238   PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen));
2239   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2240 
2241   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2242   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2243 
2244   /* allocate the matrix space */
2245   if (mallocmatspace) {
2246     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2247 
2248     c->singlemalloc = PETSC_TRUE;
2249 
2250     if (m > 0) {
2251       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2252       if (cpvalues == MAT_COPY_VALUES) {
2253         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2254       } else {
2255         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2256       }
2257     }
2258   }
2259 
2260   c->ignorezeroentries = a->ignorezeroentries;
2261   c->roworiented       = a->roworiented;
2262   c->nonew             = a->nonew;
2263   if (a->diag) {
2264     PetscCall(PetscMalloc1(m, &c->diag));
2265     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2266   } else c->diag = NULL;
2267 
2268   c->solve_work         = NULL;
2269   c->saved_values       = NULL;
2270   c->idiag              = NULL;
2271   c->ssor_work          = NULL;
2272   c->keepnonzeropattern = a->keepnonzeropattern;
2273   c->free_val           = PETSC_TRUE;
2274   c->free_colidx        = PETSC_TRUE;
2275 
2276   c->maxallocmat  = a->maxallocmat;
2277   c->maxallocrow  = a->maxallocrow;
2278   c->rlenmax      = a->rlenmax;
2279   c->nz           = a->nz;
2280   C->preallocated = PETSC_TRUE;
2281 
2282   c->nonzerorowcnt = a->nonzerorowcnt;
2283   C->nonzerostate  = A->nonzerostate;
2284 
2285   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2286   PetscFunctionReturn(PETSC_SUCCESS);
2287 }
2288 
2289 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2290 {
2291   PetscFunctionBegin;
2292   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2293   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2294   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2295   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2296   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2297   PetscFunctionReturn(PETSC_SUCCESS);
2298 }
2299 
2300 /*MC
2301    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2302    based on the sliced Ellpack format
2303 
2304    Options Database Key:
2305 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2306 
2307    Level: beginner
2308 
2309 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2310 M*/
2311 
2312 /*MC
2313    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2314 
2315    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2316    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2317   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2318   for communicators controlling multiple processes.  It is recommended that you call both of
2319   the above preallocation routines for simplicity.
2320 
2321    Options Database Key:
2322 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2323 
2324   Level: beginner
2325 
2326   Notes:
2327    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2328 
2329    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2330    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2331 
2332   Developer Notes:
2333    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2334 
2335    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2336 .vb
2337                             (2 0  3 4)
2338    Consider the matrix A =  (5 0  6 0)
2339                             (0 0  7 8)
2340                             (0 0  9 9)
2341 
2342    symbolically the Ellpack format can be written as
2343 
2344         (2 3 4 |)           (0 2 3 |)
2345    v =  (5 6 0 |)  colidx = (0 2 2 |)
2346         --------            ---------
2347         (7 8 |)             (2 3 |)
2348         (9 9 |)             (2 3 |)
2349 
2350     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2351     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2352     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2353 
2354     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2355 
2356 .ve
2357 
2358       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2359 
2360  References:
2361 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2362    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2363 
2364 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2365 M*/
2366 
2367 /*@C
2368        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2369 
2370  Collective
2371 
2372  Input Parameters:
2373 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2374 .  m - number of rows
2375 .  n - number of columns
2376 .  rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2377 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2378 
2379  Output Parameter:
2380 .  A - the matrix
2381 
2382  Level: intermediate
2383 
2384  Notes:
2385  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2386  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2387  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2388 
2389  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2390  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2391  allocation.
2392 
2393  .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2394  @*/
2395 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2396 {
2397   PetscFunctionBegin;
2398   PetscCall(MatCreate(comm, A));
2399   PetscCall(MatSetSizes(*A, m, n, m, n));
2400   PetscCall(MatSetType(*A, MATSEQSELL));
2401   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2402   PetscFunctionReturn(PETSC_SUCCESS);
2403 }
2404 
2405 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2406 {
2407   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2408   PetscInt     totalslices = a->totalslices;
2409 
2410   PetscFunctionBegin;
2411   /* If the  matrix dimensions are not equal,or no of nonzeros */
2412   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2413     *flg = PETSC_FALSE;
2414     PetscFunctionReturn(PETSC_SUCCESS);
2415   }
2416   /* if the a->colidx are the same */
2417   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2418   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2419   /* if a->val are the same */
2420   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2421   PetscFunctionReturn(PETSC_SUCCESS);
2422 }
2423 
2424 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2425 {
2426   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2427 
2428   PetscFunctionBegin;
2429   a->idiagvalid = PETSC_FALSE;
2430   PetscFunctionReturn(PETSC_SUCCESS);
2431 }
2432 
2433 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2434 {
2435 #if defined(PETSC_USE_COMPLEX)
2436   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2437   PetscInt     i;
2438   PetscScalar *val = a->val;
2439 
2440   PetscFunctionBegin;
2441   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2442   #if defined(PETSC_HAVE_CUDA)
2443   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2444   #endif
2445 #else
2446   PetscFunctionBegin;
2447 #endif
2448   PetscFunctionReturn(PETSC_SUCCESS);
2449 }
2450