xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 07e43b41f7cb79bba1664e5473a3e2df04f149fa)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter `nz`
48  (or the array `nnz`).
49 
50  Collective
51 
52  Input Parameters:
53 +  B - The `MATSEQSELL` matrix
54 .  rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
55 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
56 
57  Level: intermediate
58 
59  Notes:
60  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
61  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
62  allocation.
63 
64  You can call `MatGetInfo()` to get information on how effective the preallocation was;
65  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
66  You can also run with the option `-info` and look for messages with the string
67  malloc in them to see if additional memory allocation was needed.
68 
69  Developer Note:
70  Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
71  entries or columns indices.
72 
73  The maximum number of nonzeos in any row should be as accurate as possible.
74  If it is underestimated, you will get bad performance due to reallocation
75  (`MatSeqXSELLReallocateSELL()`).
76 
77  .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
78  @*/
79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
80 {
81   PetscFunctionBegin;
82   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
83   PetscValidType(B, 1);
84   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
89 {
90   Mat_SeqSELL *b;
91   PetscInt     i, j, totalslices;
92   PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
93 
94   PetscFunctionBegin;
95   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
96   if (maxallocrow == MAT_SKIP_ALLOCATION) {
97     skipallocation = PETSC_TRUE;
98     maxallocrow    = 0;
99   }
100 
101   PetscCall(PetscLayoutSetUp(B->rmap));
102   PetscCall(PetscLayoutSetUp(B->cmap));
103 
104   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
105   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
106   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
107   if (rlen) {
108     for (i = 0; i < B->rmap->n; i++) {
109       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
110       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
111     }
112   }
113 
114   B->preallocated = PETSC_TRUE;
115 
116   b = (Mat_SeqSELL *)B->data;
117 
118   if (!b->sliceheight) { /* not set yet */
119 #if defined(PETSC_HAVE_CUDA)
120     b->sliceheight = 16;
121 #else
122     b->sliceheight = 8;
123 #endif
124   }
125   totalslices    = PetscCeilInt(B->rmap->n, b->sliceheight);
126   b->totalslices = totalslices;
127   if (!skipallocation) {
128     if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
129 
130     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
131       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
132     }
133     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
134       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
135       else if (maxallocrow < 0) maxallocrow = 1;
136       for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow;
137     } else {
138       maxallocrow  = 0;
139       b->sliidx[0] = 0;
140       for (i = 1; i < totalslices; i++) {
141         b->sliidx[i] = 0;
142         for (j = 0; j < b->sliceheight; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); }
143         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
144         PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i]));
145       }
146       /* last slice */
147       b->sliidx[totalslices] = 0;
148       for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
149       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
150       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices];
151     }
152 
153     /* allocate space for val, colidx, rlen */
154     /* FIXME: should B's old memory be unlogged? */
155     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
156     /* FIXME: assuming an element of the bit array takes 8 bits */
157     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
158     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
159     PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen));
160 
161     b->singlemalloc = PETSC_TRUE;
162     b->free_val     = PETSC_TRUE;
163     b->free_colidx  = PETSC_TRUE;
164   } else {
165     b->free_val    = PETSC_FALSE;
166     b->free_colidx = PETSC_FALSE;
167   }
168 
169   b->nz               = 0;
170   b->maxallocrow      = maxallocrow;
171   b->rlenmax          = maxallocrow;
172   b->maxallocmat      = b->sliidx[totalslices];
173   B->info.nz_unneeded = (double)b->maxallocmat;
174   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
175   PetscFunctionReturn(PETSC_SUCCESS);
176 }
177 
178 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
179 {
180   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
181   PetscInt     shift;
182 
183   PetscFunctionBegin;
184   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
185   if (nz) *nz = a->rlen[row];
186   shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight);
187   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
188   if (idx) {
189     PetscInt j;
190     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j];
191     *idx = a->getrowcols;
192   }
193   if (v) {
194     PetscInt j;
195     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j];
196     *v = a->getrowvals;
197   }
198   PetscFunctionReturn(PETSC_SUCCESS);
199 }
200 
201 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
202 {
203   PetscFunctionBegin;
204   PetscFunctionReturn(PETSC_SUCCESS);
205 }
206 
207 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
208 {
209   Mat          B;
210   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
211   PetscInt     i;
212 
213   PetscFunctionBegin;
214   if (reuse == MAT_REUSE_MATRIX) {
215     B = *newmat;
216     PetscCall(MatZeroEntries(B));
217   } else {
218     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
219     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
220     PetscCall(MatSetType(B, MATSEQAIJ));
221     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
222   }
223 
224   for (i = 0; i < A->rmap->n; i++) {
225     PetscInt     nz = 0, *cols = NULL;
226     PetscScalar *vals = NULL;
227 
228     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
229     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
230     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
231   }
232 
233   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
234   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
235   B->rmap->bs = A->rmap->bs;
236 
237   if (reuse == MAT_INPLACE_MATRIX) {
238     PetscCall(MatHeaderReplace(A, &B));
239   } else {
240     *newmat = B;
241   }
242   PetscFunctionReturn(PETSC_SUCCESS);
243 }
244 
245 #include <../src/mat/impls/aij/seq/aij.h>
246 
247 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
248 {
249   Mat                B;
250   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
251   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
252   const PetscInt    *cols;
253   const PetscScalar *vals;
254 
255   PetscFunctionBegin;
256 
257   if (reuse == MAT_REUSE_MATRIX) {
258     B = *newmat;
259   } else {
260     if (PetscDefined(USE_DEBUG) || !a->ilen) {
261       PetscCall(PetscMalloc1(m, &rowlengths));
262       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
263     }
264     if (PetscDefined(USE_DEBUG) && a->ilen) {
265       PetscBool eq;
266       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
267       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
268       PetscCall(PetscFree(rowlengths));
269       rowlengths = a->ilen;
270     } else if (a->ilen) rowlengths = a->ilen;
271     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
272     PetscCall(MatSetSizes(B, m, n, m, n));
273     PetscCall(MatSetType(B, MATSEQSELL));
274     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
275     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
276   }
277 
278   for (row = 0; row < m; row++) {
279     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
280     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
281     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
282   }
283   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
284   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
285   B->rmap->bs = A->rmap->bs;
286 
287   if (reuse == MAT_INPLACE_MATRIX) {
288     PetscCall(MatHeaderReplace(A, &B));
289   } else {
290     *newmat = B;
291   }
292   PetscFunctionReturn(PETSC_SUCCESS);
293 }
294 
295 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
296 {
297   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
298   PetscScalar       *y;
299   const PetscScalar *x;
300   const MatScalar   *aval        = a->val;
301   PetscInt           totalslices = a->totalslices;
302   const PetscInt    *acolidx     = a->colidx;
303   PetscInt           i, j;
304 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
305   __m512d  vec_x, vec_y, vec_vals;
306   __m256i  vec_idx;
307   __mmask8 mask;
308   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
309   __m256i  vec_idx2, vec_idx3, vec_idx4;
310 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
311   __m128i   vec_idx;
312   __m256d   vec_x, vec_y, vec_y2, vec_vals;
313   MatScalar yval;
314   PetscInt  r, rows_left, row, nnz_in_row;
315 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
316   __m128d   vec_x_tmp;
317   __m256d   vec_x, vec_y, vec_y2, vec_vals;
318   MatScalar yval;
319   PetscInt  r, rows_left, row, nnz_in_row;
320 #else
321   PetscInt     k, sliceheight = a->sliceheight;
322   PetscScalar *sum;
323 #endif
324 
325 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
326   #pragma disjoint(*x, *y, *aval)
327 #endif
328 
329   PetscFunctionBegin;
330   PetscCall(VecGetArrayRead(xx, &x));
331   PetscCall(VecGetArray(yy, &y));
332 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
333   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
334   for (i = 0; i < totalslices; i++) { /* loop over slices */
335     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
336     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
337 
338     vec_y  = _mm512_setzero_pd();
339     vec_y2 = _mm512_setzero_pd();
340     vec_y3 = _mm512_setzero_pd();
341     vec_y4 = _mm512_setzero_pd();
342 
343     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
344     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
345     case 3:
346       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
347       acolidx += 8;
348       aval += 8;
349       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
350       acolidx += 8;
351       aval += 8;
352       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
353       acolidx += 8;
354       aval += 8;
355       j += 3;
356       break;
357     case 2:
358       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
359       acolidx += 8;
360       aval += 8;
361       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
362       acolidx += 8;
363       aval += 8;
364       j += 2;
365       break;
366     case 1:
367       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
368       acolidx += 8;
369       aval += 8;
370       j += 1;
371       break;
372     }
373   #pragma novector
374     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
375       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
376       acolidx += 8;
377       aval += 8;
378       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
379       acolidx += 8;
380       aval += 8;
381       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
382       acolidx += 8;
383       aval += 8;
384       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
385       acolidx += 8;
386       aval += 8;
387     }
388 
389     vec_y = _mm512_add_pd(vec_y, vec_y2);
390     vec_y = _mm512_add_pd(vec_y, vec_y3);
391     vec_y = _mm512_add_pd(vec_y, vec_y4);
392     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
393       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
394       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
395     } else {
396       _mm512_storeu_pd(&y[8 * i], vec_y);
397     }
398   }
399 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
400   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
401   for (i = 0; i < totalslices; i++) { /* loop over full slices */
402     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
403     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
404 
405     /* last slice may have padding rows. Don't use vectorization. */
406     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
407       rows_left = A->rmap->n - 8 * i;
408       for (r = 0; r < rows_left; ++r) {
409         yval       = (MatScalar)0;
410         row        = 8 * i + r;
411         nnz_in_row = a->rlen[row];
412         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
413         y[row] = yval;
414       }
415       break;
416     }
417 
418     vec_y  = _mm256_setzero_pd();
419     vec_y2 = _mm256_setzero_pd();
420 
421   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
422   #pragma novector
423   #pragma unroll(2)
424     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
425       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
426       aval += 4;
427       acolidx += 4;
428       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
429       aval += 4;
430       acolidx += 4;
431     }
432 
433     _mm256_storeu_pd(y + i * 8, vec_y);
434     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
435   }
436 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
437   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
438   for (i = 0; i < totalslices; i++) { /* loop over full slices */
439     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
440     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
441 
442     vec_y  = _mm256_setzero_pd();
443     vec_y2 = _mm256_setzero_pd();
444 
445     /* last slice may have padding rows. Don't use vectorization. */
446     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
447       rows_left = A->rmap->n - 8 * i;
448       for (r = 0; r < rows_left; ++r) {
449         yval       = (MatScalar)0;
450         row        = 8 * i + r;
451         nnz_in_row = a->rlen[row];
452         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
453         y[row] = yval;
454       }
455       break;
456     }
457 
458   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
459   #pragma novector
460   #pragma unroll(2)
461     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
462       vec_vals  = _mm256_loadu_pd(aval);
463       vec_x_tmp = _mm_setzero_pd();
464       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
465       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
466       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
467       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
468       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
469       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
470       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
471       aval += 4;
472 
473       vec_vals  = _mm256_loadu_pd(aval);
474       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
475       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
476       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
477       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
478       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
479       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
480       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
481       aval += 4;
482     }
483 
484     _mm256_storeu_pd(y + i * 8, vec_y);
485     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
486   }
487 #else
488   PetscCall(PetscMalloc1(sliceheight, &sum));
489   for (i = 0; i < totalslices; i++) { /* loop over slices */
490     for (j = 0; j < sliceheight; j++) {
491       sum[j] = 0.0;
492       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
493     }
494     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */
495       for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j];
496     } else {
497       for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j];
498     }
499   }
500   PetscCall(PetscFree(sum));
501 #endif
502 
503   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
504   PetscCall(VecRestoreArrayRead(xx, &x));
505   PetscCall(VecRestoreArray(yy, &y));
506   PetscFunctionReturn(PETSC_SUCCESS);
507 }
508 
509 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
510 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
511 {
512   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
513   PetscScalar       *y, *z;
514   const PetscScalar *x;
515   const MatScalar   *aval        = a->val;
516   PetscInt           totalslices = a->totalslices;
517   const PetscInt    *acolidx     = a->colidx;
518   PetscInt           i, j;
519 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
520   __m512d  vec_x, vec_y, vec_vals;
521   __m256i  vec_idx;
522   __mmask8 mask;
523   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
524   __m256i  vec_idx2, vec_idx3, vec_idx4;
525 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
526   __m128d   vec_x_tmp;
527   __m256d   vec_x, vec_y, vec_y2, vec_vals;
528   MatScalar yval;
529   PetscInt  r, row, nnz_in_row;
530 #else
531   PetscInt     k, sliceheight = a->sliceheight;
532   PetscScalar *sum;
533 #endif
534 
535 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
536   #pragma disjoint(*x, *y, *aval)
537 #endif
538 
539   PetscFunctionBegin;
540   if (!a->nz) {
541     PetscCall(VecCopy(yy, zz));
542     PetscFunctionReturn(PETSC_SUCCESS);
543   }
544   PetscCall(VecGetArrayRead(xx, &x));
545   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
546 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
547   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
548   for (i = 0; i < totalslices; i++) { /* loop over slices */
549     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
550     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
551 
552     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
553       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
554       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
555     } else {
556       vec_y = _mm512_loadu_pd(&y[8 * i]);
557     }
558     vec_y2 = _mm512_setzero_pd();
559     vec_y3 = _mm512_setzero_pd();
560     vec_y4 = _mm512_setzero_pd();
561 
562     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
563     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
564     case 3:
565       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
566       acolidx += 8;
567       aval += 8;
568       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
569       acolidx += 8;
570       aval += 8;
571       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
572       acolidx += 8;
573       aval += 8;
574       j += 3;
575       break;
576     case 2:
577       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
578       acolidx += 8;
579       aval += 8;
580       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
581       acolidx += 8;
582       aval += 8;
583       j += 2;
584       break;
585     case 1:
586       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
587       acolidx += 8;
588       aval += 8;
589       j += 1;
590       break;
591     }
592   #pragma novector
593     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
594       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
595       acolidx += 8;
596       aval += 8;
597       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
598       acolidx += 8;
599       aval += 8;
600       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
601       acolidx += 8;
602       aval += 8;
603       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
604       acolidx += 8;
605       aval += 8;
606     }
607 
608     vec_y = _mm512_add_pd(vec_y, vec_y2);
609     vec_y = _mm512_add_pd(vec_y, vec_y3);
610     vec_y = _mm512_add_pd(vec_y, vec_y4);
611     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
612       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
613     } else {
614       _mm512_storeu_pd(&z[8 * i], vec_y);
615     }
616   }
617 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
618   PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight);
619   for (i = 0; i < totalslices; i++) { /* loop over full slices */
620     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
621     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
622 
623     /* last slice may have padding rows. Don't use vectorization. */
624     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
625       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
626         row        = 8 * i + r;
627         yval       = (MatScalar)0.0;
628         nnz_in_row = a->rlen[row];
629         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
630         z[row] = y[row] + yval;
631       }
632       break;
633     }
634 
635     vec_y  = _mm256_loadu_pd(y + 8 * i);
636     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
637 
638     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
639     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
640       vec_vals  = _mm256_loadu_pd(aval);
641       vec_x_tmp = _mm_setzero_pd();
642       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
643       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
644       vec_x     = _mm256_setzero_pd();
645       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
646       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
647       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
648       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
649       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
650       aval += 4;
651 
652       vec_vals  = _mm256_loadu_pd(aval);
653       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
654       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
655       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
656       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
657       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
658       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
659       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
660       aval += 4;
661     }
662 
663     _mm256_storeu_pd(z + i * 8, vec_y);
664     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
665   }
666 #else
667   PetscCall(PetscMalloc1(sliceheight, &sum));
668   for (i = 0; i < totalslices; i++) { /* loop over slices */
669     for (j = 0; j < sliceheight; j++) {
670       sum[j] = 0.0;
671       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]];
672     }
673     if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
674       for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
675     } else {
676       for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j];
677     }
678   }
679   PetscCall(PetscFree(sum));
680 #endif
681 
682   PetscCall(PetscLogFlops(2.0 * a->nz));
683   PetscCall(VecRestoreArrayRead(xx, &x));
684   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
685   PetscFunctionReturn(PETSC_SUCCESS);
686 }
687 
688 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
689 {
690   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
691   PetscScalar       *y;
692   const PetscScalar *x;
693   const MatScalar   *aval    = a->val;
694   const PetscInt    *acolidx = a->colidx;
695   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight;
696 
697 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
698   #pragma disjoint(*x, *y, *aval)
699 #endif
700 
701   PetscFunctionBegin;
702   if (A->symmetric == PETSC_BOOL3_TRUE) {
703     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
704     PetscFunctionReturn(PETSC_SUCCESS);
705   }
706   if (zz != yy) PetscCall(VecCopy(zz, yy));
707 
708   if (a->nz) {
709     PetscCall(VecGetArrayRead(xx, &x));
710     PetscCall(VecGetArray(yy, &y));
711     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
712       if (i == totalslices - 1 && (A->rmap->n % sliceheight)) {
713         for (r = 0; r < (A->rmap->n % sliceheight); ++r) {
714           row        = sliceheight * i + r;
715           nnz_in_row = a->rlen[row];
716           for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row];
717         }
718         break;
719       }
720       for (r = 0; r < sliceheight; ++r)
721         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r];
722     }
723     PetscCall(PetscLogFlops(2.0 * a->nz));
724     PetscCall(VecRestoreArrayRead(xx, &x));
725     PetscCall(VecRestoreArray(yy, &y));
726   }
727   PetscFunctionReturn(PETSC_SUCCESS);
728 }
729 
730 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
731 {
732   PetscFunctionBegin;
733   if (A->symmetric == PETSC_BOOL3_TRUE) {
734     PetscCall(MatMult_SeqSELL(A, xx, yy));
735   } else {
736     PetscCall(VecSet(yy, 0.0));
737     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
738   }
739   PetscFunctionReturn(PETSC_SUCCESS);
740 }
741 
742 /*
743      Checks for missing diagonals
744 */
745 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
746 {
747   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
748   PetscInt    *diag, i;
749 
750   PetscFunctionBegin;
751   *missing = PETSC_FALSE;
752   if (A->rmap->n > 0 && !(a->colidx)) {
753     *missing = PETSC_TRUE;
754     if (d) *d = 0;
755     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
756   } else {
757     diag = a->diag;
758     for (i = 0; i < A->rmap->n; i++) {
759       if (diag[i] == -1) {
760         *missing = PETSC_TRUE;
761         if (d) *d = i;
762         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
763         break;
764       }
765     }
766   }
767   PetscFunctionReturn(PETSC_SUCCESS);
768 }
769 
770 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
771 {
772   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
773   PetscInt     i, j, m = A->rmap->n, shift;
774 
775   PetscFunctionBegin;
776   if (!a->diag) {
777     PetscCall(PetscMalloc1(m, &a->diag));
778     a->free_diag = PETSC_TRUE;
779   }
780   for (i = 0; i < m; i++) {                                          /* loop over rows */
781     shift      = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
782     a->diag[i] = -1;
783     for (j = 0; j < a->rlen[i]; j++) {
784       if (a->colidx[shift + a->sliceheight * j] == i) {
785         a->diag[i] = shift + a->sliceheight * j;
786         break;
787       }
788     }
789   }
790   PetscFunctionReturn(PETSC_SUCCESS);
791 }
792 
793 /*
794   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
795 */
796 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
797 {
798   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
799   PetscInt     i, *diag, m = A->rmap->n;
800   MatScalar   *val = a->val;
801   PetscScalar *idiag, *mdiag;
802 
803   PetscFunctionBegin;
804   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
805   PetscCall(MatMarkDiagonal_SeqSELL(A));
806   diag = a->diag;
807   if (!a->idiag) {
808     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
809     val = a->val;
810   }
811   mdiag = a->mdiag;
812   idiag = a->idiag;
813 
814   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
815     for (i = 0; i < m; i++) {
816       mdiag[i] = val[diag[i]];
817       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
818         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
819         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
820         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
821         A->factorerror_zeropivot_value = 0.0;
822         A->factorerror_zeropivot_row   = i;
823       }
824       idiag[i] = 1.0 / val[diag[i]];
825     }
826     PetscCall(PetscLogFlops(m));
827   } else {
828     for (i = 0; i < m; i++) {
829       mdiag[i] = val[diag[i]];
830       idiag[i] = omega / (fshift + val[diag[i]]);
831     }
832     PetscCall(PetscLogFlops(2.0 * m));
833   }
834   a->idiagvalid = PETSC_TRUE;
835   PetscFunctionReturn(PETSC_SUCCESS);
836 }
837 
838 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
839 {
840   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
841 
842   PetscFunctionBegin;
843   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
844   PetscCall(MatSeqSELLInvalidateDiagonal(A));
845   PetscFunctionReturn(PETSC_SUCCESS);
846 }
847 
848 PetscErrorCode MatDestroy_SeqSELL(Mat A)
849 {
850   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
851 
852   PetscFunctionBegin;
853 #if defined(PETSC_USE_LOG)
854   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
855 #endif
856   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
857   PetscCall(ISDestroy(&a->row));
858   PetscCall(ISDestroy(&a->col));
859   PetscCall(PetscFree(a->diag));
860   PetscCall(PetscFree(a->rlen));
861   PetscCall(PetscFree(a->sliidx));
862   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
863   PetscCall(PetscFree(a->solve_work));
864   PetscCall(ISDestroy(&a->icol));
865   PetscCall(PetscFree(a->saved_values));
866   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
867   PetscCall(PetscFree(A->data));
868 
869   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
870   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
871   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
872   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
873   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
874   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
875   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqAIJ_C", NULL));
876 #if defined(PETSC_HAVE_CUDA)
877   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqSELLCUDA_C", NULL));
878 #endif
879   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL));
880   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL));
881   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL));
882   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL));
883   PetscFunctionReturn(PETSC_SUCCESS);
884 }
885 
886 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
887 {
888   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
889 
890   PetscFunctionBegin;
891   switch (op) {
892   case MAT_ROW_ORIENTED:
893     a->roworiented = flg;
894     break;
895   case MAT_KEEP_NONZERO_PATTERN:
896     a->keepnonzeropattern = flg;
897     break;
898   case MAT_NEW_NONZERO_LOCATIONS:
899     a->nonew = (flg ? 0 : 1);
900     break;
901   case MAT_NEW_NONZERO_LOCATION_ERR:
902     a->nonew = (flg ? -1 : 0);
903     break;
904   case MAT_NEW_NONZERO_ALLOCATION_ERR:
905     a->nonew = (flg ? -2 : 0);
906     break;
907   case MAT_UNUSED_NONZERO_LOCATION_ERR:
908     a->nounused = (flg ? -1 : 0);
909     break;
910   case MAT_FORCE_DIAGONAL_ENTRIES:
911   case MAT_IGNORE_OFF_PROC_ENTRIES:
912   case MAT_USE_HASH_TABLE:
913   case MAT_SORTED_FULL:
914     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
915     break;
916   case MAT_SPD:
917   case MAT_SYMMETRIC:
918   case MAT_STRUCTURALLY_SYMMETRIC:
919   case MAT_HERMITIAN:
920   case MAT_SYMMETRY_ETERNAL:
921   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
922   case MAT_SPD_ETERNAL:
923     /* These options are handled directly by MatSetOption() */
924     break;
925   default:
926     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
927   }
928   PetscFunctionReturn(PETSC_SUCCESS);
929 }
930 
931 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
932 {
933   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
934   PetscInt     i, j, n, shift;
935   PetscScalar *x, zero = 0.0;
936 
937   PetscFunctionBegin;
938   PetscCall(VecGetLocalSize(v, &n));
939   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
940 
941   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
942     PetscInt *diag = a->diag;
943     PetscCall(VecGetArray(v, &x));
944     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
945     PetscCall(VecRestoreArray(v, &x));
946     PetscFunctionReturn(PETSC_SUCCESS);
947   }
948 
949   PetscCall(VecSet(v, zero));
950   PetscCall(VecGetArray(v, &x));
951   for (i = 0; i < n; i++) {                                     /* loop over rows */
952     shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
953     x[i]  = 0;
954     for (j = 0; j < a->rlen[i]; j++) {
955       if (a->colidx[shift + a->sliceheight * j] == i) {
956         x[i] = a->val[shift + a->sliceheight * j];
957         break;
958       }
959     }
960   }
961   PetscCall(VecRestoreArray(v, &x));
962   PetscFunctionReturn(PETSC_SUCCESS);
963 }
964 
965 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
966 {
967   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
968   const PetscScalar *l, *r;
969   PetscInt           i, j, m, n, row;
970 
971   PetscFunctionBegin;
972   if (ll) {
973     /* The local size is used so that VecMPI can be passed to this routine
974        by MatDiagonalScale_MPISELL */
975     PetscCall(VecGetLocalSize(ll, &m));
976     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
977     PetscCall(VecGetArrayRead(ll, &l));
978     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
979       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
980         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
981           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row];
982         }
983       } else {
984         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { a->val[j] *= l[a->sliceheight * i + row]; }
985       }
986     }
987     PetscCall(VecRestoreArrayRead(ll, &l));
988     PetscCall(PetscLogFlops(a->nz));
989   }
990   if (rr) {
991     PetscCall(VecGetLocalSize(rr, &n));
992     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
993     PetscCall(VecGetArrayRead(rr, &r));
994     for (i = 0; i < a->totalslices; i++) {                            /* loop over slices */
995       if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */
996         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) {
997           if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]];
998         }
999       } else {
1000         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
1001       }
1002     }
1003     PetscCall(VecRestoreArrayRead(rr, &r));
1004     PetscCall(PetscLogFlops(a->nz));
1005   }
1006   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1007 #if defined(PETSC_HAVE_CUDA)
1008   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1009 #endif
1010   PetscFunctionReturn(PETSC_SUCCESS);
1011 }
1012 
1013 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1014 {
1015   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1016   PetscInt    *cp, i, k, low, high, t, row, col, l;
1017   PetscInt     shift;
1018   MatScalar   *vp;
1019 
1020   PetscFunctionBegin;
1021   for (k = 0; k < m; k++) { /* loop over requested rows */
1022     row = im[k];
1023     if (row < 0) continue;
1024     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1025     shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */
1026     cp    = a->colidx + shift;                                        /* pointer to the row */
1027     vp    = a->val + shift;                                           /* pointer to the row */
1028     for (l = 0; l < n; l++) {                                         /* loop over requested columns */
1029       col = in[l];
1030       if (col < 0) continue;
1031       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1032       high = a->rlen[row];
1033       low  = 0; /* assume unsorted */
1034       while (high - low > 5) {
1035         t = (low + high) / 2;
1036         if (*(cp + a->sliceheight * t) > col) high = t;
1037         else low = t;
1038       }
1039       for (i = low; i < high; i++) {
1040         if (*(cp + a->sliceheight * i) > col) break;
1041         if (*(cp + a->sliceheight * i) == col) {
1042           *v++ = *(vp + a->sliceheight * i);
1043           goto finished;
1044         }
1045       }
1046       *v++ = 0.0;
1047     finished:;
1048     }
1049   }
1050   PetscFunctionReturn(PETSC_SUCCESS);
1051 }
1052 
1053 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1054 {
1055   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1056   PetscInt          i, j, m = A->rmap->n, shift;
1057   const char       *name;
1058   PetscViewerFormat format;
1059 
1060   PetscFunctionBegin;
1061   PetscCall(PetscViewerGetFormat(viewer, &format));
1062   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1063     PetscInt nofinalvalue = 0;
1064     /*
1065     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1066       nofinalvalue = 1;
1067     }
1068     */
1069     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1070     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1071     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1072 #if defined(PETSC_USE_COMPLEX)
1073     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1074 #else
1075     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1076 #endif
1077     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1078 
1079     for (i = 0; i < m; i++) {
1080       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1081       for (j = 0; j < a->rlen[i]; j++) {
1082 #if defined(PETSC_USE_COMPLEX)
1083         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1084 #else
1085         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j]));
1086 #endif
1087       }
1088     }
1089     /*
1090     if (nofinalvalue) {
1091 #if defined(PETSC_USE_COMPLEX)
1092       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1093 #else
1094       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1095 #endif
1096     }
1097     */
1098     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1099     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1100     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1101   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1102     PetscFunctionReturn(PETSC_SUCCESS);
1103   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1104     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1105     for (i = 0; i < m; i++) {
1106       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1107       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1108       for (j = 0; j < a->rlen[i]; j++) {
1109 #if defined(PETSC_USE_COMPLEX)
1110         if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1111           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1112         } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1113           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1114         } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) {
1115           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1116         }
1117 #else
1118         if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1119 #endif
1120       }
1121       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1122     }
1123     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1124   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1125     PetscInt    cnt = 0, jcnt;
1126     PetscScalar value;
1127 #if defined(PETSC_USE_COMPLEX)
1128     PetscBool realonly = PETSC_TRUE;
1129     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1130       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1131         realonly = PETSC_FALSE;
1132         break;
1133       }
1134     }
1135 #endif
1136 
1137     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1138     for (i = 0; i < m; i++) {
1139       jcnt  = 0;
1140       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1141       for (j = 0; j < A->cmap->n; j++) {
1142         if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) {
1143           value = a->val[cnt++];
1144           jcnt++;
1145         } else {
1146           value = 0.0;
1147         }
1148 #if defined(PETSC_USE_COMPLEX)
1149         if (realonly) {
1150           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1151         } else {
1152           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1153         }
1154 #else
1155         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1156 #endif
1157       }
1158       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1159     }
1160     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1161   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1162     PetscInt fshift = 1;
1163     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1164 #if defined(PETSC_USE_COMPLEX)
1165     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1166 #else
1167     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1168 #endif
1169     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1170     for (i = 0; i < m; i++) {
1171       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1172       for (j = 0; j < a->rlen[i]; j++) {
1173 #if defined(PETSC_USE_COMPLEX)
1174         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1175 #else
1176         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j]));
1177 #endif
1178       }
1179     }
1180     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1181   } else if (format == PETSC_VIEWER_NATIVE) {
1182     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1183       PetscInt row;
1184       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1185       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) {
1186 #if defined(PETSC_USE_COMPLEX)
1187         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1188           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1189         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1190           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1191         } else {
1192           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1193         }
1194 #else
1195         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j]));
1196 #endif
1197       }
1198     }
1199   } else {
1200     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1201     if (A->factortype) {
1202       for (i = 0; i < m; i++) {
1203         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1204         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1205         /* L part */
1206         for (j = shift; j < a->diag[i]; j += a->sliceheight) {
1207 #if defined(PETSC_USE_COMPLEX)
1208           if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) {
1209             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1210           } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) {
1211             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1212           } else {
1213             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1214           }
1215 #else
1216           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1217 #endif
1218         }
1219         /* diagonal */
1220         j = a->diag[i];
1221 #if defined(PETSC_USE_COMPLEX)
1222         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1223           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1224         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1225           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1226         } else {
1227           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1228         }
1229 #else
1230         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1231 #endif
1232 
1233         /* U part */
1234         for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) {
1235 #if defined(PETSC_USE_COMPLEX)
1236           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1237             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1238           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1239             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1240           } else {
1241             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1242           }
1243 #else
1244           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1245 #endif
1246         }
1247         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1248       }
1249     } else {
1250       for (i = 0; i < m; i++) {
1251         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1252         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1253         for (j = 0; j < a->rlen[i]; j++) {
1254 #if defined(PETSC_USE_COMPLEX)
1255           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1256             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1257           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1258             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j])));
1259           } else {
1260             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j])));
1261           }
1262 #else
1263           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j]));
1264 #endif
1265         }
1266         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1267       }
1268     }
1269     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1270   }
1271   PetscCall(PetscViewerFlush(viewer));
1272   PetscFunctionReturn(PETSC_SUCCESS);
1273 }
1274 
1275 #include <petscdraw.h>
1276 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1277 {
1278   Mat               A = (Mat)Aa;
1279   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1280   PetscInt          i, j, m = A->rmap->n, shift;
1281   int               color;
1282   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1283   PetscViewer       viewer;
1284   PetscViewerFormat format;
1285 
1286   PetscFunctionBegin;
1287   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1288   PetscCall(PetscViewerGetFormat(viewer, &format));
1289   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1290 
1291   /* loop over matrix elements drawing boxes */
1292 
1293   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1294     PetscDrawCollectiveBegin(draw);
1295     /* Blue for negative, Cyan for zero and  Red for positive */
1296     color = PETSC_DRAW_BLUE;
1297     for (i = 0; i < m; i++) {
1298       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1299       y_l   = m - i - 1.0;
1300       y_r   = y_l + 1.0;
1301       for (j = 0; j < a->rlen[i]; j++) {
1302         x_l = a->colidx[shift + a->sliceheight * j];
1303         x_r = x_l + 1.0;
1304         if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue;
1305         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1306       }
1307     }
1308     color = PETSC_DRAW_CYAN;
1309     for (i = 0; i < m; i++) {
1310       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1311       y_l   = m - i - 1.0;
1312       y_r   = y_l + 1.0;
1313       for (j = 0; j < a->rlen[i]; j++) {
1314         x_l = a->colidx[shift + a->sliceheight * j];
1315         x_r = x_l + 1.0;
1316         if (a->val[shift + a->sliceheight * j] != 0.) continue;
1317         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1318       }
1319     }
1320     color = PETSC_DRAW_RED;
1321     for (i = 0; i < m; i++) {
1322       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1323       y_l   = m - i - 1.0;
1324       y_r   = y_l + 1.0;
1325       for (j = 0; j < a->rlen[i]; j++) {
1326         x_l = a->colidx[shift + a->sliceheight * j];
1327         x_r = x_l + 1.0;
1328         if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue;
1329         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1330       }
1331     }
1332     PetscDrawCollectiveEnd(draw);
1333   } else {
1334     /* use contour shading to indicate magnitude of values */
1335     /* first determine max of all nonzero values */
1336     PetscReal minv = 0.0, maxv = 0.0;
1337     PetscInt  count = 0;
1338     PetscDraw popup;
1339     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1340       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1341     }
1342     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1343     PetscCall(PetscDrawGetPopup(draw, &popup));
1344     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1345 
1346     PetscDrawCollectiveBegin(draw);
1347     for (i = 0; i < m; i++) {
1348       shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight;
1349       y_l   = m - i - 1.0;
1350       y_r   = y_l + 1.0;
1351       for (j = 0; j < a->rlen[i]; j++) {
1352         x_l   = a->colidx[shift + a->sliceheight * j];
1353         x_r   = x_l + 1.0;
1354         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1355         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1356         count++;
1357       }
1358     }
1359     PetscDrawCollectiveEnd(draw);
1360   }
1361   PetscFunctionReturn(PETSC_SUCCESS);
1362 }
1363 
1364 #include <petscdraw.h>
1365 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1366 {
1367   PetscDraw draw;
1368   PetscReal xr, yr, xl, yl, h, w;
1369   PetscBool isnull;
1370 
1371   PetscFunctionBegin;
1372   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1373   PetscCall(PetscDrawIsNull(draw, &isnull));
1374   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1375 
1376   xr = A->cmap->n;
1377   yr = A->rmap->n;
1378   h  = yr / 10.0;
1379   w  = xr / 10.0;
1380   xr += w;
1381   yr += h;
1382   xl = -w;
1383   yl = -h;
1384   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1385   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1386   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1387   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1388   PetscCall(PetscDrawSave(draw));
1389   PetscFunctionReturn(PETSC_SUCCESS);
1390 }
1391 
1392 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1393 {
1394   PetscBool iascii, isbinary, isdraw;
1395 
1396   PetscFunctionBegin;
1397   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1398   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1399   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1400   if (iascii) {
1401     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1402   } else if (isbinary) {
1403     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1404   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1405   PetscFunctionReturn(PETSC_SUCCESS);
1406 }
1407 
1408 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1409 {
1410   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1411   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1412   MatScalar   *vp;
1413 
1414   PetscFunctionBegin;
1415   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1416   /* To do: compress out the unused elements */
1417   PetscCall(MatMarkDiagonal_SeqSELL(A));
1418   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1419   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1420   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1421   a->nonzerorowcnt = 0;
1422   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1423   for (i = 0; i < a->totalslices; ++i) {
1424     shift = a->sliidx[i];                                                   /* starting index of the slice */
1425     cp    = a->colidx + shift;                                              /* pointer to the column indices of the slice */
1426     vp    = a->val + shift;                                                 /* pointer to the nonzero values of the slice */
1427     for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */
1428       row  = a->sliceheight * i + row_in_slice;
1429       nrow = a->rlen[row]; /* number of nonzeros in row */
1430       /*
1431         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1432         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1433       */
1434       lastcol = 0;
1435       if (nrow > 0) { /* nonempty row */
1436         a->nonzerorowcnt++;
1437         lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1438       } else if (!row_in_slice) {                                 /* first row of the correct slice is empty */
1439         for (j = 1; j < a->sliceheight; j++) {
1440           if (a->rlen[a->sliceheight * i + j]) {
1441             lastcol = cp[j];
1442             break;
1443           }
1444         }
1445       } else {
1446         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1447       }
1448 
1449       for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) {
1450         cp[a->sliceheight * k + row_in_slice] = lastcol;
1451         vp[a->sliceheight * k + row_in_slice] = (MatScalar)0;
1452       }
1453     }
1454   }
1455 
1456   A->info.mallocs += a->reallocs;
1457   a->reallocs = 0;
1458 
1459   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1460   PetscFunctionReturn(PETSC_SUCCESS);
1461 }
1462 
1463 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1464 {
1465   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1466 
1467   PetscFunctionBegin;
1468   info->block_size   = 1.0;
1469   info->nz_allocated = a->maxallocmat;
1470   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1471   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1472   info->assemblies   = A->num_ass;
1473   info->mallocs      = A->info.mallocs;
1474   info->memory       = 0; /* REVIEW ME */
1475   if (A->factortype) {
1476     info->fill_ratio_given  = A->info.fill_ratio_given;
1477     info->fill_ratio_needed = A->info.fill_ratio_needed;
1478     info->factor_mallocs    = A->info.factor_mallocs;
1479   } else {
1480     info->fill_ratio_given  = 0;
1481     info->fill_ratio_needed = 0;
1482     info->factor_mallocs    = 0;
1483   }
1484   PetscFunctionReturn(PETSC_SUCCESS);
1485 }
1486 
1487 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1488 {
1489   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1490   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1491   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1492   MatScalar   *vp, value;
1493 #if defined(PETSC_HAVE_CUDA)
1494   PetscBool inserted = PETSC_FALSE;
1495 #endif
1496 
1497   PetscFunctionBegin;
1498   for (k = 0; k < m; k++) { /* loop over added rows */
1499     row = im[k];
1500     if (row < 0) continue;
1501     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1502     shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */
1503     cp    = a->colidx + shift;                                      /* pointer to the row */
1504     vp    = a->val + shift;                                         /* pointer to the row */
1505     nrow  = a->rlen[row];
1506     low   = 0;
1507     high  = nrow;
1508 
1509     for (l = 0; l < n; l++) { /* loop over added columns */
1510       col = in[l];
1511       if (col < 0) continue;
1512       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1513       if (a->roworiented) {
1514         value = v[l + k * n];
1515       } else {
1516         value = v[k + l * m];
1517       }
1518       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1519 
1520       /* search in this row for the specified column, i indicates the column to be set */
1521       if (col <= lastcol) low = 0;
1522       else high = nrow;
1523       lastcol = col;
1524       while (high - low > 5) {
1525         t = (low + high) / 2;
1526         if (*(cp + a->sliceheight * t) > col) high = t;
1527         else low = t;
1528       }
1529       for (i = low; i < high; i++) {
1530         if (*(cp + a->sliceheight * i) > col) break;
1531         if (*(cp + a->sliceheight * i) == col) {
1532           if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value;
1533           else *(vp + a->sliceheight * i) = value;
1534 #if defined(PETSC_HAVE_CUDA)
1535           inserted = PETSC_TRUE;
1536 #endif
1537           low = i + 1;
1538           goto noinsert;
1539         }
1540       }
1541       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1542       if (nonew == 1) goto noinsert;
1543       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1544       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1545       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1546       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1547       for (ii = nrow - 1; ii >= i; ii--) {
1548         *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii);
1549         *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii);
1550       }
1551       a->rlen[row]++;
1552       *(cp + a->sliceheight * i) = col;
1553       *(vp + a->sliceheight * i) = value;
1554       a->nz++;
1555       A->nonzerostate++;
1556 #if defined(PETSC_HAVE_CUDA)
1557       inserted = PETSC_TRUE;
1558 #endif
1559       low = i + 1;
1560       high++;
1561       nrow++;
1562     noinsert:;
1563     }
1564     a->rlen[row] = nrow;
1565   }
1566 #if defined(PETSC_HAVE_CUDA)
1567   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1568 #endif
1569   PetscFunctionReturn(PETSC_SUCCESS);
1570 }
1571 
1572 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1573 {
1574   PetscFunctionBegin;
1575   /* If the two matrices have the same copy implementation, use fast copy. */
1576   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1577     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1578     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1579 
1580     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1581     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1582   } else {
1583     PetscCall(MatCopy_Basic(A, B, str));
1584   }
1585   PetscFunctionReturn(PETSC_SUCCESS);
1586 }
1587 
1588 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1589 {
1590   PetscFunctionBegin;
1591   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1592   PetscFunctionReturn(PETSC_SUCCESS);
1593 }
1594 
1595 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1596 {
1597   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1598 
1599   PetscFunctionBegin;
1600   *array = a->val;
1601   PetscFunctionReturn(PETSC_SUCCESS);
1602 }
1603 
1604 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1605 {
1606   PetscFunctionBegin;
1607   PetscFunctionReturn(PETSC_SUCCESS);
1608 }
1609 
1610 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1611 {
1612   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1613   PetscInt     i;
1614   MatScalar   *aval = a->val;
1615 
1616   PetscFunctionBegin;
1617   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1618 #if defined(PETSC_HAVE_CUDA)
1619   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1620 #endif
1621   PetscFunctionReturn(PETSC_SUCCESS);
1622 }
1623 
1624 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1625 {
1626   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1627   PetscInt     i;
1628   MatScalar   *aval = a->val;
1629 
1630   PetscFunctionBegin;
1631   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1632   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1633 #if defined(PETSC_HAVE_CUDA)
1634   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1635 #endif
1636   PetscFunctionReturn(PETSC_SUCCESS);
1637 }
1638 
1639 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1640 {
1641   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1642   MatScalar   *aval   = a->val;
1643   PetscScalar  oalpha = alpha;
1644   PetscBLASInt one    = 1, size;
1645 
1646   PetscFunctionBegin;
1647   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1648   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1649   PetscCall(PetscLogFlops(a->nz));
1650   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1651 #if defined(PETSC_HAVE_CUDA)
1652   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1653 #endif
1654   PetscFunctionReturn(PETSC_SUCCESS);
1655 }
1656 
1657 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1658 {
1659   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1660 
1661   PetscFunctionBegin;
1662   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1663   PetscCall(MatShift_Basic(Y, a));
1664   PetscFunctionReturn(PETSC_SUCCESS);
1665 }
1666 
1667 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1668 {
1669   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1670   PetscScalar       *x, sum, *t;
1671   const MatScalar   *idiag = NULL, *mdiag;
1672   const PetscScalar *b, *xb;
1673   PetscInt           n, m = A->rmap->n, i, j, shift;
1674   const PetscInt    *diag;
1675 
1676   PetscFunctionBegin;
1677   its = its * lits;
1678 
1679   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1680   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1681   a->fshift = fshift;
1682   a->omega  = omega;
1683 
1684   diag  = a->diag;
1685   t     = a->ssor_work;
1686   idiag = a->idiag;
1687   mdiag = a->mdiag;
1688 
1689   PetscCall(VecGetArray(xx, &x));
1690   PetscCall(VecGetArrayRead(bb, &b));
1691   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1692   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1693   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1694   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1695 
1696   if (flag & SOR_ZERO_INITIAL_GUESS) {
1697     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1698       for (i = 0; i < m; i++) {
1699         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1700         sum   = b[i];
1701         n     = (diag[i] - shift) / a->sliceheight;
1702         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1703         t[i] = sum;
1704         x[i] = sum * idiag[i];
1705       }
1706       xb = t;
1707       PetscCall(PetscLogFlops(a->nz));
1708     } else xb = b;
1709     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1710       for (i = m - 1; i >= 0; i--) {
1711         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1712         sum   = xb[i];
1713         n     = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1714         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1715         if (xb == b) {
1716           x[i] = sum * idiag[i];
1717         } else {
1718           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1719         }
1720       }
1721       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1722     }
1723     its--;
1724   }
1725   while (its--) {
1726     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1727       for (i = 0; i < m; i++) {
1728         /* lower */
1729         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1730         sum   = b[i];
1731         n     = (diag[i] - shift) / a->sliceheight;
1732         for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1733         t[i] = sum; /* save application of the lower-triangular part */
1734         /* upper */
1735         n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1736         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1737         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1738       }
1739       xb = t;
1740       PetscCall(PetscLogFlops(2.0 * a->nz));
1741     } else xb = b;
1742     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1743       for (i = m - 1; i >= 0; i--) {
1744         shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */
1745         sum   = xb[i];
1746         if (xb == b) {
1747           /* whole matrix (no checkpointing available) */
1748           n = a->rlen[i];
1749           for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]];
1750           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1751         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1752           n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1;
1753           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]];
1754           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1755         }
1756       }
1757       if (xb == b) {
1758         PetscCall(PetscLogFlops(2.0 * a->nz));
1759       } else {
1760         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1761       }
1762     }
1763   }
1764   PetscCall(VecRestoreArray(xx, &x));
1765   PetscCall(VecRestoreArrayRead(bb, &b));
1766   PetscFunctionReturn(PETSC_SUCCESS);
1767 }
1768 
1769 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1770                                        MatGetRow_SeqSELL,
1771                                        MatRestoreRow_SeqSELL,
1772                                        MatMult_SeqSELL,
1773                                        /* 4*/ MatMultAdd_SeqSELL,
1774                                        MatMultTranspose_SeqSELL,
1775                                        MatMultTransposeAdd_SeqSELL,
1776                                        NULL,
1777                                        NULL,
1778                                        NULL,
1779                                        /* 10*/ NULL,
1780                                        NULL,
1781                                        NULL,
1782                                        MatSOR_SeqSELL,
1783                                        NULL,
1784                                        /* 15*/ MatGetInfo_SeqSELL,
1785                                        MatEqual_SeqSELL,
1786                                        MatGetDiagonal_SeqSELL,
1787                                        MatDiagonalScale_SeqSELL,
1788                                        NULL,
1789                                        /* 20*/ NULL,
1790                                        MatAssemblyEnd_SeqSELL,
1791                                        MatSetOption_SeqSELL,
1792                                        MatZeroEntries_SeqSELL,
1793                                        /* 24*/ NULL,
1794                                        NULL,
1795                                        NULL,
1796                                        NULL,
1797                                        NULL,
1798                                        /* 29*/ MatSetUp_SeqSELL,
1799                                        NULL,
1800                                        NULL,
1801                                        NULL,
1802                                        NULL,
1803                                        /* 34*/ MatDuplicate_SeqSELL,
1804                                        NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        /* 39*/ NULL,
1809                                        NULL,
1810                                        NULL,
1811                                        MatGetValues_SeqSELL,
1812                                        MatCopy_SeqSELL,
1813                                        /* 44*/ NULL,
1814                                        MatScale_SeqSELL,
1815                                        MatShift_SeqSELL,
1816                                        NULL,
1817                                        NULL,
1818                                        /* 49*/ NULL,
1819                                        NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1824                                        NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        /* 59*/ NULL,
1829                                        MatDestroy_SeqSELL,
1830                                        MatView_SeqSELL,
1831                                        NULL,
1832                                        NULL,
1833                                        /* 64*/ NULL,
1834                                        NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        /* 69*/ NULL,
1839                                        NULL,
1840                                        NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        /* 74*/ NULL,
1844                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        /* 79*/ NULL,
1849                                        NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        /* 84*/ NULL,
1854                                        NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        /* 89*/ NULL,
1859                                        NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        /* 94*/ NULL,
1864                                        NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        /* 99*/ NULL,
1869                                        NULL,
1870                                        NULL,
1871                                        MatConjugate_SeqSELL,
1872                                        NULL,
1873                                        /*104*/ NULL,
1874                                        NULL,
1875                                        NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        /*109*/ NULL,
1879                                        NULL,
1880                                        NULL,
1881                                        NULL,
1882                                        MatMissingDiagonal_SeqSELL,
1883                                        /*114*/ NULL,
1884                                        NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        /*119*/ NULL,
1889                                        NULL,
1890                                        NULL,
1891                                        NULL,
1892                                        NULL,
1893                                        /*124*/ NULL,
1894                                        NULL,
1895                                        NULL,
1896                                        NULL,
1897                                        NULL,
1898                                        /*129*/ NULL,
1899                                        NULL,
1900                                        NULL,
1901                                        NULL,
1902                                        NULL,
1903                                        /*134*/ NULL,
1904                                        NULL,
1905                                        NULL,
1906                                        NULL,
1907                                        NULL,
1908                                        /*139*/ NULL,
1909                                        NULL,
1910                                        NULL,
1911                                        MatFDColoringSetUp_SeqXAIJ,
1912                                        NULL,
1913                                        /*144*/ NULL,
1914                                        NULL,
1915                                        NULL,
1916                                        NULL,
1917                                        NULL,
1918                                        NULL,
1919                                        /*150*/ NULL,
1920                                        NULL};
1921 
1922 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1923 {
1924   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1925 
1926   PetscFunctionBegin;
1927   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1928 
1929   /* allocate space for values if not already there */
1930   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1931 
1932   /* copy values over */
1933   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1934   PetscFunctionReturn(PETSC_SUCCESS);
1935 }
1936 
1937 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1938 {
1939   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1940 
1941   PetscFunctionBegin;
1942   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1943   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1944   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1945   PetscFunctionReturn(PETSC_SUCCESS);
1946 }
1947 
1948 PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio)
1949 {
1950   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1951 
1952   PetscFunctionBegin;
1953   if (a->totalslices && a->sliidx[a->totalslices]) {
1954     *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices];
1955   } else {
1956     *ratio = 0.0;
1957   }
1958   PetscFunctionReturn(PETSC_SUCCESS);
1959 }
1960 
1961 PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth)
1962 {
1963   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1964   PetscInt     i, current_slicewidth;
1965 
1966   PetscFunctionBegin;
1967   *slicewidth = 0;
1968   for (i = 0; i < a->totalslices; i++) {
1969     current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight;
1970     if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth;
1971   }
1972   PetscFunctionReturn(PETSC_SUCCESS);
1973 }
1974 
1975 PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth)
1976 {
1977   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1978 
1979   PetscFunctionBegin;
1980   *slicewidth = 0;
1981   if (a->totalslices) { *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; }
1982   PetscFunctionReturn(PETSC_SUCCESS);
1983 }
1984 
1985 PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight)
1986 {
1987   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1988 
1989   PetscFunctionBegin;
1990   if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS);
1991   PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight);
1992   a->sliceheight = sliceheight;
1993   PetscFunctionReturn(PETSC_SUCCESS);
1994 }
1995 
1996 /*@C
1997   MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
1998 
1999   Not Collective
2000 
2001   Input Parameters:
2002 +  A - a `MATSEQSELL` matrix
2003 -  array - pointer to the data
2004 
2005   Level: intermediate
2006 
2007 .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
2008 @*/
2009 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
2010 {
2011   PetscFunctionBegin;
2012   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
2013   PetscFunctionReturn(PETSC_SUCCESS);
2014 }
2015 
2016 /*@C
2017   MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix.
2018 
2019   Not Collective
2020 
2021   Input Parameter:
2022 .  A - a MATSEQSELL matrix
2023 
2024   Output Parameter:
2025 .  ratio - ratio of number of padded zeros to number of allocated elements
2026 
2027   Level: intermediate
2028 @*/
2029 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio)
2030 {
2031   PetscFunctionBegin;
2032   PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio));
2033   PetscFunctionReturn(PETSC_SUCCESS);
2034 }
2035 
2036 /*@C
2037   MatSeqSELLGetMaxSliceWidth - returns the maximum slice width.
2038 
2039   Not Collective
2040 
2041   Input Parameter:
2042 .  A - a MATSEQSELL matrix
2043 
2044   Output Parameter:
2045 .  slicewidth - maximum slice width
2046 
2047  Level: intermediate
2048 @*/
2049 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth)
2050 {
2051   PetscFunctionBegin;
2052   PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth));
2053   PetscFunctionReturn(PETSC_SUCCESS);
2054 }
2055 
2056 /*@C
2057   MatSeqSELLGetAvgSliceWidth - returns the average slice width.
2058 
2059   Not Collective
2060 
2061   Input Parameter:
2062 .  A - a MATSEQSELL matrix
2063 
2064   Output Parameter:
2065 .  slicewidth - average slice width
2066 
2067   Level: intermediate
2068 @*/
2069 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth)
2070 {
2071   PetscFunctionBegin;
2072   PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth));
2073   PetscFunctionReturn(PETSC_SUCCESS);
2074 }
2075 
2076 /*@C
2077   MatSeqSELLSetSliceHeight - sets the slice height.
2078 
2079   Not Collective
2080 
2081   Input Parameters:
2082 +  A - a MATSEQSELL matrix
2083 -  sliceheight - slice height
2084 
2085   Notes:
2086   You cannot change the slice height once it have been set.
2087 
2088   The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called.
2089 
2090   Level: intermediate
2091 @*/
2092 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight)
2093 {
2094   PetscFunctionBegin;
2095   PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight));
2096   PetscFunctionReturn(PETSC_SUCCESS);
2097 }
2098 
2099 /*@C
2100   MatSeqSELLGetVarSliceSize - returns the variance of the slice size.
2101 
2102   Not Collective
2103 
2104   Input Parameter:
2105 .  A - a MATSEQSELL matrix
2106 
2107   Output Parameter:
2108 .  variance - variance of the slice size
2109 
2110   Level: intermediate
2111 @*/
2112 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance)
2113 {
2114   PetscFunctionBegin;
2115   PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance));
2116   PetscFunctionReturn(PETSC_SUCCESS);
2117 }
2118 
2119 #if defined(PETSC_HAVE_CUDA)
2120 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
2121 #endif
2122 
2123 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
2124 {
2125   Mat_SeqSELL *b;
2126   PetscMPIInt  size;
2127 
2128   PetscFunctionBegin;
2129   PetscCall(PetscCitationsRegister(citation, &cited));
2130   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2131   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
2132 
2133   PetscCall(PetscNew(&b));
2134 
2135   B->data = (void *)b;
2136 
2137   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
2138 
2139   b->row                = NULL;
2140   b->col                = NULL;
2141   b->icol               = NULL;
2142   b->reallocs           = 0;
2143   b->ignorezeroentries  = PETSC_FALSE;
2144   b->roworiented        = PETSC_TRUE;
2145   b->nonew              = 0;
2146   b->diag               = NULL;
2147   b->solve_work         = NULL;
2148   B->spptr              = NULL;
2149   b->saved_values       = NULL;
2150   b->idiag              = NULL;
2151   b->mdiag              = NULL;
2152   b->ssor_work          = NULL;
2153   b->omega              = 1.0;
2154   b->fshift             = 0.0;
2155   b->idiagvalid         = PETSC_FALSE;
2156   b->keepnonzeropattern = PETSC_FALSE;
2157   b->sliceheight        = 0;
2158 
2159   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
2160   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
2161   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
2162   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
2163   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
2164   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
2165   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqAIJ_C", MatConvert_SeqSELL_SeqAIJ));
2166 #if defined(PETSC_HAVE_CUDA)
2167   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqSELLCUDA_C", MatConvert_SeqSELL_SeqSELLCUDA));
2168 #endif
2169   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL));
2170   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL));
2171   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL));
2172   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL));
2173 
2174   PetscObjectOptionsBegin((PetscObject)B);
2175   {
2176     PetscInt  newsh = -1;
2177     PetscBool flg;
2178 
2179     PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg));
2180     if (flg) { PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); }
2181   }
2182   PetscOptionsEnd();
2183   PetscFunctionReturn(PETSC_SUCCESS);
2184 }
2185 
2186 /*
2187  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2188  */
2189 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2190 {
2191   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2192   PetscInt     i, m                           = A->rmap->n;
2193   PetscInt     totalslices = a->totalslices;
2194 
2195   PetscFunctionBegin;
2196   C->factortype = A->factortype;
2197   c->row        = NULL;
2198   c->col        = NULL;
2199   c->icol       = NULL;
2200   c->reallocs   = 0;
2201   C->assembled  = PETSC_TRUE;
2202 
2203   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2204   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2205 
2206   PetscCall(PetscMalloc1(a->sliceheight * totalslices, &c->rlen));
2207   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2208 
2209   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2210   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2211 
2212   /* allocate the matrix space */
2213   if (mallocmatspace) {
2214     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2215 
2216     c->singlemalloc = PETSC_TRUE;
2217 
2218     if (m > 0) {
2219       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2220       if (cpvalues == MAT_COPY_VALUES) {
2221         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2222       } else {
2223         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2224       }
2225     }
2226   }
2227 
2228   c->ignorezeroentries = a->ignorezeroentries;
2229   c->roworiented       = a->roworiented;
2230   c->nonew             = a->nonew;
2231   if (a->diag) {
2232     PetscCall(PetscMalloc1(m, &c->diag));
2233     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2234   } else c->diag = NULL;
2235 
2236   c->solve_work         = NULL;
2237   c->saved_values       = NULL;
2238   c->idiag              = NULL;
2239   c->ssor_work          = NULL;
2240   c->keepnonzeropattern = a->keepnonzeropattern;
2241   c->free_val           = PETSC_TRUE;
2242   c->free_colidx        = PETSC_TRUE;
2243 
2244   c->maxallocmat  = a->maxallocmat;
2245   c->maxallocrow  = a->maxallocrow;
2246   c->rlenmax      = a->rlenmax;
2247   c->nz           = a->nz;
2248   C->preallocated = PETSC_TRUE;
2249 
2250   c->nonzerorowcnt = a->nonzerorowcnt;
2251   C->nonzerostate  = A->nonzerostate;
2252 
2253   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2254   PetscFunctionReturn(PETSC_SUCCESS);
2255 }
2256 
2257 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2258 {
2259   PetscFunctionBegin;
2260   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2261   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2262   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2263   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2264   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2265   PetscFunctionReturn(PETSC_SUCCESS);
2266 }
2267 
2268 /*MC
2269    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2270    based on the sliced Ellpack format
2271 
2272    Options Database Key:
2273 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2274 
2275    Level: beginner
2276 
2277 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2278 M*/
2279 
2280 /*MC
2281    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2282 
2283    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2284    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2285   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2286   for communicators controlling multiple processes.  It is recommended that you call both of
2287   the above preallocation routines for simplicity.
2288 
2289    Options Database Key:
2290 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2291 
2292   Level: beginner
2293 
2294   Notes:
2295    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2296 
2297    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2298    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2299 
2300   Developer Notes:
2301    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2302 
2303    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2304 .vb
2305                             (2 0  3 4)
2306    Consider the matrix A =  (5 0  6 0)
2307                             (0 0  7 8)
2308                             (0 0  9 9)
2309 
2310    symbolically the Ellpack format can be written as
2311 
2312         (2 3 4 |)           (0 2 3 |)
2313    v =  (5 6 0 |)  colidx = (0 2 2 |)
2314         --------            ---------
2315         (7 8 |)             (2 3 |)
2316         (9 9 |)             (2 3 |)
2317 
2318     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2319     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2320     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2321 
2322     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2323 
2324 .ve
2325 
2326       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2327 
2328  References:
2329 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2330    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2331 
2332 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2333 M*/
2334 
2335 /*@C
2336        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2337 
2338  Collective
2339 
2340  Input Parameters:
2341 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2342 .  m - number of rows
2343 .  n - number of columns
2344 .  rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2345 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2346 
2347  Output Parameter:
2348 .  A - the matrix
2349 
2350  Level: intermediate
2351 
2352  Notes:
2353  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2354  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2355  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2356 
2357  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2358  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2359  allocation.
2360 
2361  .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2362  @*/
2363 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2364 {
2365   PetscFunctionBegin;
2366   PetscCall(MatCreate(comm, A));
2367   PetscCall(MatSetSizes(*A, m, n, m, n));
2368   PetscCall(MatSetType(*A, MATSEQSELL));
2369   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2370   PetscFunctionReturn(PETSC_SUCCESS);
2371 }
2372 
2373 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2374 {
2375   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2376   PetscInt     totalslices = a->totalslices;
2377 
2378   PetscFunctionBegin;
2379   /* If the  matrix dimensions are not equal,or no of nonzeros */
2380   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2381     *flg = PETSC_FALSE;
2382     PetscFunctionReturn(PETSC_SUCCESS);
2383   }
2384   /* if the a->colidx are the same */
2385   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2386   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2387   /* if a->val are the same */
2388   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2389   PetscFunctionReturn(PETSC_SUCCESS);
2390 }
2391 
2392 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2393 {
2394   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2395 
2396   PetscFunctionBegin;
2397   a->idiagvalid = PETSC_FALSE;
2398   PetscFunctionReturn(PETSC_SUCCESS);
2399 }
2400 
2401 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2402 {
2403 #if defined(PETSC_USE_COMPLEX)
2404   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2405   PetscInt     i;
2406   PetscScalar *val = a->val;
2407 
2408   PetscFunctionBegin;
2409   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2410   #if defined(PETSC_HAVE_CUDA)
2411   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2412   #endif
2413 #else
2414   PetscFunctionBegin;
2415 #endif
2416   PetscFunctionReturn(PETSC_SUCCESS);
2417 }
2418