xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 2d1451d43b73a0495cd81c074cbc1e0206888947)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter `nz`
48  (or the array `nnz`).
49 
50  Collective
51 
52  Input Parameters:
53 +  B - The `MATSEQSELL` matrix
54 .  rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided
55 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
56 
57  Level: intermediate
58 
59  Notes:
60  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
61  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
62  allocation.
63 
64  You can call `MatGetInfo()` to get information on how effective the preallocation was;
65  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
66  You can also run with the option `-info` and look for messages with the string
67  malloc in them to see if additional memory allocation was needed.
68 
69  Developer Note:
70  Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
71  entries or columns indices.
72 
73  The maximum number of nonzeos in any row should be as accurate as possible.
74  If it is underestimated, you will get bad performance due to reallocation
75  (`MatSeqXSELLReallocateSELL()`).
76 
77  .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
78  @*/
79 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
80 {
81   PetscFunctionBegin;
82   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
83   PetscValidType(B, 1);
84   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
85   PetscFunctionReturn(PETSC_SUCCESS);
86 }
87 
88 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
89 {
90   Mat_SeqSELL *b;
91   PetscInt     i, j, totalslices;
92   PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
93 
94   PetscFunctionBegin;
95   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
96   if (maxallocrow == MAT_SKIP_ALLOCATION) {
97     skipallocation = PETSC_TRUE;
98     maxallocrow    = 0;
99   }
100 
101   PetscCall(PetscLayoutSetUp(B->rmap));
102   PetscCall(PetscLayoutSetUp(B->cmap));
103 
104   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
105   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
106   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
107   if (rlen) {
108     for (i = 0; i < B->rmap->n; i++) {
109       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
110       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
111     }
112   }
113 
114   B->preallocated = PETSC_TRUE;
115 
116   b = (Mat_SeqSELL *)B->data;
117 
118   totalslices    = PetscCeilInt(B->rmap->n, SLICE_HEIGHT);
119   b->totalslices = totalslices;
120   if (!skipallocation) {
121     if (B->rmap->n % SLICE_HEIGHT) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n));
122 
123     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
124       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
125     }
126     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
127       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
128       else if (maxallocrow < 0) maxallocrow = 1;
129       for (i = 0; i <= totalslices; i++) b->sliidx[i] = SLICE_HEIGHT * i * maxallocrow;
130     } else {
131       maxallocrow  = 0;
132       b->sliidx[0] = 0;
133       for (i = 1; i < totalslices; i++) {
134         b->sliidx[i] = 0;
135         for (j = 0; j < SLICE_HEIGHT; j++) { b->sliidx[i] = PetscMax(b->sliidx[i], rlen[SLICE_HEIGHT * (i - 1) + j]); }
136         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
137         PetscCall(PetscIntSumError(b->sliidx[i - 1], SLICE_HEIGHT * b->sliidx[i], &b->sliidx[i]));
138       }
139       /* last slice */
140       b->sliidx[totalslices] = 0;
141       for (j = SLICE_HEIGHT * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
142       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
143       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + SLICE_HEIGHT * b->sliidx[totalslices];
144     }
145 
146     /* allocate space for val, colidx, rlen */
147     /* FIXME: should B's old memory be unlogged? */
148     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
149     /* FIXME: assuming an element of the bit array takes 8 bits */
150     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
151     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
152     PetscCall(PetscCalloc1(SLICE_HEIGHT * totalslices, &b->rlen));
153 
154     b->singlemalloc = PETSC_TRUE;
155     b->free_val     = PETSC_TRUE;
156     b->free_colidx  = PETSC_TRUE;
157   } else {
158     b->free_val    = PETSC_FALSE;
159     b->free_colidx = PETSC_FALSE;
160   }
161 
162   b->nz               = 0;
163   b->maxallocrow      = maxallocrow;
164   b->rlenmax          = maxallocrow;
165   b->maxallocmat      = b->sliidx[totalslices];
166   B->info.nz_unneeded = (double)b->maxallocmat;
167   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
168   PetscFunctionReturn(PETSC_SUCCESS);
169 }
170 
171 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
172 {
173   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
174   PetscInt     shift;
175 
176   PetscFunctionBegin;
177   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
178   if (nz) *nz = a->rlen[row];
179   shift = a->sliidx[row / SLICE_HEIGHT] + (row % SLICE_HEIGHT);
180   if (!a->getrowcols) { PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); }
181   if (idx) {
182     PetscInt j;
183     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + SLICE_HEIGHT * j];
184     *idx = a->getrowcols;
185   }
186   if (v) {
187     PetscInt j;
188     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + SLICE_HEIGHT * j];
189     *v = a->getrowvals;
190   }
191   PetscFunctionReturn(PETSC_SUCCESS);
192 }
193 
194 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
195 {
196   PetscFunctionBegin;
197   PetscFunctionReturn(PETSC_SUCCESS);
198 }
199 
200 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
201 {
202   Mat          B;
203   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
204   PetscInt     i;
205 
206   PetscFunctionBegin;
207   if (reuse == MAT_REUSE_MATRIX) {
208     B = *newmat;
209     PetscCall(MatZeroEntries(B));
210   } else {
211     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
212     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
213     PetscCall(MatSetType(B, MATSEQAIJ));
214     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
215   }
216 
217   for (i = 0; i < A->rmap->n; i++) {
218     PetscInt     nz = 0, *cols = NULL;
219     PetscScalar *vals = NULL;
220 
221     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
222     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
223     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
224   }
225 
226   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
227   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
228   B->rmap->bs = A->rmap->bs;
229 
230   if (reuse == MAT_INPLACE_MATRIX) {
231     PetscCall(MatHeaderReplace(A, &B));
232   } else {
233     *newmat = B;
234   }
235   PetscFunctionReturn(PETSC_SUCCESS);
236 }
237 
238 #include <../src/mat/impls/aij/seq/aij.h>
239 
240 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
241 {
242   Mat                B;
243   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
244   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
245   const PetscInt    *cols;
246   const PetscScalar *vals;
247 
248   PetscFunctionBegin;
249 
250   if (reuse == MAT_REUSE_MATRIX) {
251     B = *newmat;
252   } else {
253     if (PetscDefined(USE_DEBUG) || !a->ilen) {
254       PetscCall(PetscMalloc1(m, &rowlengths));
255       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
256     }
257     if (PetscDefined(USE_DEBUG) && a->ilen) {
258       PetscBool eq;
259       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
260       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
261       PetscCall(PetscFree(rowlengths));
262       rowlengths = a->ilen;
263     } else if (a->ilen) rowlengths = a->ilen;
264     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
265     PetscCall(MatSetSizes(B, m, n, m, n));
266     PetscCall(MatSetType(B, MATSEQSELL));
267     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
268     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
269   }
270 
271   for (row = 0; row < m; row++) {
272     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
273     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
274     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
275   }
276   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
277   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
278   B->rmap->bs = A->rmap->bs;
279 
280   if (reuse == MAT_INPLACE_MATRIX) {
281     PetscCall(MatHeaderReplace(A, &B));
282   } else {
283     *newmat = B;
284   }
285   PetscFunctionReturn(PETSC_SUCCESS);
286 }
287 
288 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
289 {
290   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
291   PetscScalar       *y;
292   const PetscScalar *x;
293   const MatScalar   *aval        = a->val;
294   PetscInt           totalslices = a->totalslices;
295   const PetscInt    *acolidx     = a->colidx;
296   PetscInt           i, j;
297 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
298   __m512d  vec_x, vec_y, vec_vals;
299   __m256i  vec_idx;
300   __mmask8 mask;
301   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
302   __m256i  vec_idx2, vec_idx3, vec_idx4;
303 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
304   __m128i   vec_idx;
305   __m256d   vec_x, vec_y, vec_y2, vec_vals;
306   MatScalar yval;
307   PetscInt  r, rows_left, row, nnz_in_row;
308 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
309   __m128d   vec_x_tmp;
310   __m256d   vec_x, vec_y, vec_y2, vec_vals;
311   MatScalar yval;
312   PetscInt  r, rows_left, row, nnz_in_row;
313 #else
314   PetscInt    k;
315   PetscScalar sum[SLICE_HEIGHT];
316 #endif
317 
318 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
319   #pragma disjoint(*x, *y, *aval)
320 #endif
321 
322   PetscFunctionBegin;
323   PetscCall(VecGetArrayRead(xx, &x));
324   PetscCall(VecGetArray(yy, &y));
325 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
326   for (i = 0; i < totalslices; i++) { /* loop over slices */
327     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
328     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
329 
330     vec_y  = _mm512_setzero_pd();
331     vec_y2 = _mm512_setzero_pd();
332     vec_y3 = _mm512_setzero_pd();
333     vec_y4 = _mm512_setzero_pd();
334 
335     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
336     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
337     case 3:
338       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
339       acolidx += 8;
340       aval += 8;
341       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
342       acolidx += 8;
343       aval += 8;
344       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
345       acolidx += 8;
346       aval += 8;
347       j += 3;
348       break;
349     case 2:
350       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
351       acolidx += 8;
352       aval += 8;
353       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
354       acolidx += 8;
355       aval += 8;
356       j += 2;
357       break;
358     case 1:
359       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
360       acolidx += 8;
361       aval += 8;
362       j += 1;
363       break;
364     }
365   #pragma novector
366     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
367       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
368       acolidx += 8;
369       aval += 8;
370       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
371       acolidx += 8;
372       aval += 8;
373       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
374       acolidx += 8;
375       aval += 8;
376       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
377       acolidx += 8;
378       aval += 8;
379     }
380 
381     vec_y = _mm512_add_pd(vec_y, vec_y2);
382     vec_y = _mm512_add_pd(vec_y, vec_y3);
383     vec_y = _mm512_add_pd(vec_y, vec_y4);
384     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
385       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
386       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
387     } else {
388       _mm512_storeu_pd(&y[8 * i], vec_y);
389     }
390   }
391 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
392   for (i = 0; i < totalslices; i++) { /* loop over full slices */
393     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
394     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
395 
396     /* last slice may have padding rows. Don't use vectorization. */
397     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
398       rows_left = A->rmap->n - 8 * i;
399       for (r = 0; r < rows_left; ++r) {
400         yval       = (MatScalar)0;
401         row        = 8 * i + r;
402         nnz_in_row = a->rlen[row];
403         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
404         y[row] = yval;
405       }
406       break;
407     }
408 
409     vec_y  = _mm256_setzero_pd();
410     vec_y2 = _mm256_setzero_pd();
411 
412   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
413   #pragma novector
414   #pragma unroll(2)
415     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
416       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
417       aval += 4;
418       acolidx += 4;
419       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
420       aval += 4;
421       acolidx += 4;
422     }
423 
424     _mm256_storeu_pd(y + i * 8, vec_y);
425     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
426   }
427 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
428   for (i = 0; i < totalslices; i++) { /* loop over full slices */
429     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
430     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
431 
432     vec_y  = _mm256_setzero_pd();
433     vec_y2 = _mm256_setzero_pd();
434 
435     /* last slice may have padding rows. Don't use vectorization. */
436     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
437       rows_left = A->rmap->n - 8 * i;
438       for (r = 0; r < rows_left; ++r) {
439         yval       = (MatScalar)0;
440         row        = 8 * i + r;
441         nnz_in_row = a->rlen[row];
442         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
443         y[row] = yval;
444       }
445       break;
446     }
447 
448   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
449   #pragma novector
450   #pragma unroll(2)
451     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
452       vec_vals  = _mm256_loadu_pd(aval);
453       vec_x_tmp = _mm_setzero_pd();
454       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
455       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
456       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
457       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
458       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
459       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
460       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
461       aval += 4;
462 
463       vec_vals  = _mm256_loadu_pd(aval);
464       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
465       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
466       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
467       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
468       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
469       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
470       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
471       aval += 4;
472     }
473 
474     _mm256_storeu_pd(y + i * 8, vec_y);
475     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
476   }
477 #else
478   for (i = 0; i < totalslices; i++) { /* loop over slices */
479     for (j = 0; j < SLICE_HEIGHT; j++) {
480       sum[j] = 0.0;
481       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += SLICE_HEIGHT) sum[j] += aval[k] * x[acolidx[k]];
482     }
483     if (i == totalslices - 1 && (A->rmap->n % SLICE_HEIGHT)) { /* if last slice has padding rows */
484       for (j = 0; j < (A->rmap->n % SLICE_HEIGHT); j++) y[SLICE_HEIGHT * i + j] = sum[j];
485     } else {
486       for (j = 0; j < SLICE_HEIGHT; j++) y[SLICE_HEIGHT * i + j] = sum[j];
487     }
488   }
489 #endif
490 
491   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
492   PetscCall(VecRestoreArrayRead(xx, &x));
493   PetscCall(VecRestoreArray(yy, &y));
494   PetscFunctionReturn(PETSC_SUCCESS);
495 }
496 
497 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
498 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
499 {
500   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
501   PetscScalar       *y, *z;
502   const PetscScalar *x;
503   const MatScalar   *aval        = a->val;
504   PetscInt           totalslices = a->totalslices;
505   const PetscInt    *acolidx     = a->colidx;
506   PetscInt           i, j;
507 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
508   __m512d  vec_x, vec_y, vec_vals;
509   __m256i  vec_idx;
510   __mmask8 mask;
511   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
512   __m256i  vec_idx2, vec_idx3, vec_idx4;
513 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
514   __m128d   vec_x_tmp;
515   __m256d   vec_x, vec_y, vec_y2, vec_vals;
516   MatScalar yval;
517   PetscInt  r, row, nnz_in_row;
518 #else
519   PetscInt    k;
520   PetscScalar sum[SLICE_HEIGHT];
521 #endif
522 
523 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
524   #pragma disjoint(*x, *y, *aval)
525 #endif
526 
527   PetscFunctionBegin;
528   if (!a->nz) {
529     PetscCall(VecCopy(yy, zz));
530     PetscFunctionReturn(PETSC_SUCCESS);
531   }
532   PetscCall(VecGetArrayRead(xx, &x));
533   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
534 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
535   for (i = 0; i < totalslices; i++) { /* loop over slices */
536     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
537     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
538 
539     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
540       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
541       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
542     } else {
543       vec_y = _mm512_loadu_pd(&y[8 * i]);
544     }
545     vec_y2 = _mm512_setzero_pd();
546     vec_y3 = _mm512_setzero_pd();
547     vec_y4 = _mm512_setzero_pd();
548 
549     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */
550     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
551     case 3:
552       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
553       acolidx += 8;
554       aval += 8;
555       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
556       acolidx += 8;
557       aval += 8;
558       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
559       acolidx += 8;
560       aval += 8;
561       j += 3;
562       break;
563     case 2:
564       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
565       acolidx += 8;
566       aval += 8;
567       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
568       acolidx += 8;
569       aval += 8;
570       j += 2;
571       break;
572     case 1:
573       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
574       acolidx += 8;
575       aval += 8;
576       j += 1;
577       break;
578     }
579   #pragma novector
580     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
581       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
582       acolidx += 8;
583       aval += 8;
584       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
585       acolidx += 8;
586       aval += 8;
587       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
588       acolidx += 8;
589       aval += 8;
590       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
591       acolidx += 8;
592       aval += 8;
593     }
594 
595     vec_y = _mm512_add_pd(vec_y, vec_y2);
596     vec_y = _mm512_add_pd(vec_y, vec_y3);
597     vec_y = _mm512_add_pd(vec_y, vec_y4);
598     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
599       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
600     } else {
601       _mm512_storeu_pd(&z[8 * i], vec_y);
602     }
603   }
604 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
605   for (i = 0; i < totalslices; i++) { /* loop over full slices */
606     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
607     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
608 
609     /* last slice may have padding rows. Don't use vectorization. */
610     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
611       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
612         row        = 8 * i + r;
613         yval       = (MatScalar)0.0;
614         nnz_in_row = a->rlen[row];
615         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
616         z[row] = y[row] + yval;
617       }
618       break;
619     }
620 
621     vec_y  = _mm256_loadu_pd(y + 8 * i);
622     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
623 
624     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
625     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
626       vec_vals  = _mm256_loadu_pd(aval);
627       vec_x_tmp = _mm_setzero_pd();
628       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
629       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
630       vec_x     = _mm256_setzero_pd();
631       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
632       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
633       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
634       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
635       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
636       aval += 4;
637 
638       vec_vals  = _mm256_loadu_pd(aval);
639       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
640       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
641       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
642       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
643       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
644       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
645       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
646       aval += 4;
647     }
648 
649     _mm256_storeu_pd(z + i * 8, vec_y);
650     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
651   }
652 #else
653   for (i = 0; i < totalslices; i++) { /* loop over slices */
654     for (j = 0; j < SLICE_HEIGHT; j++) {
655       sum[j] = 0.0;
656       for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += SLICE_HEIGHT) sum[j] += aval[k] * x[acolidx[k]];
657     }
658     if (i == totalslices - 1 && (A->rmap->n % SLICE_HEIGHT)) {
659       for (j = 0; j < (A->rmap->n % SLICE_HEIGHT); j++) z[SLICE_HEIGHT * i + j] = y[SLICE_HEIGHT * i + j] + sum[j];
660     } else {
661       for (j = 0; j < SLICE_HEIGHT; j++) z[SLICE_HEIGHT * i + j] = y[SLICE_HEIGHT * i + j] + sum[j];
662     }
663   }
664 #endif
665 
666   PetscCall(PetscLogFlops(2.0 * a->nz));
667   PetscCall(VecRestoreArrayRead(xx, &x));
668   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
669   PetscFunctionReturn(PETSC_SUCCESS);
670 }
671 
672 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
673 {
674   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
675   PetscScalar       *y;
676   const PetscScalar *x;
677   const MatScalar   *aval    = a->val;
678   const PetscInt    *acolidx = a->colidx;
679   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;
680 
681 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
682   #pragma disjoint(*x, *y, *aval)
683 #endif
684 
685   PetscFunctionBegin;
686   if (A->symmetric == PETSC_BOOL3_TRUE) {
687     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
688     PetscFunctionReturn(PETSC_SUCCESS);
689   }
690   if (zz != yy) PetscCall(VecCopy(zz, yy));
691 
692   if (a->nz) {
693     PetscCall(VecGetArrayRead(xx, &x));
694     PetscCall(VecGetArray(yy, &y));
695     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
696       if (i == totalslices - 1 && (A->rmap->n % SLICE_HEIGHT)) {
697         for (r = 0; r < (A->rmap->n % SLICE_HEIGHT); ++r) {
698           row        = SLICE_HEIGHT * i + r;
699           nnz_in_row = a->rlen[row];
700           for (j = 0; j < nnz_in_row; ++j) y[acolidx[SLICE_HEIGHT * j + r]] += aval[SLICE_HEIGHT * j + r] * x[row];
701         }
702         break;
703       }
704       for (r = 0; r < SLICE_HEIGHT; ++r)
705         for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += SLICE_HEIGHT) y[acolidx[j]] += aval[j] * x[SLICE_HEIGHT * i + r];
706     }
707     PetscCall(PetscLogFlops(2.0 * a->nz));
708     PetscCall(VecRestoreArrayRead(xx, &x));
709     PetscCall(VecRestoreArray(yy, &y));
710   }
711   PetscFunctionReturn(PETSC_SUCCESS);
712 }
713 
714 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
715 {
716   PetscFunctionBegin;
717   if (A->symmetric == PETSC_BOOL3_TRUE) {
718     PetscCall(MatMult_SeqSELL(A, xx, yy));
719   } else {
720     PetscCall(VecSet(yy, 0.0));
721     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
722   }
723   PetscFunctionReturn(PETSC_SUCCESS);
724 }
725 
726 /*
727      Checks for missing diagonals
728 */
729 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
730 {
731   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
732   PetscInt    *diag, i;
733 
734   PetscFunctionBegin;
735   *missing = PETSC_FALSE;
736   if (A->rmap->n > 0 && !(a->colidx)) {
737     *missing = PETSC_TRUE;
738     if (d) *d = 0;
739     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
740   } else {
741     diag = a->diag;
742     for (i = 0; i < A->rmap->n; i++) {
743       if (diag[i] == -1) {
744         *missing = PETSC_TRUE;
745         if (d) *d = i;
746         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
747         break;
748       }
749     }
750   }
751   PetscFunctionReturn(PETSC_SUCCESS);
752 }
753 
754 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
755 {
756   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
757   PetscInt     i, j, m = A->rmap->n, shift;
758 
759   PetscFunctionBegin;
760   if (!a->diag) {
761     PetscCall(PetscMalloc1(m, &a->diag));
762     a->free_diag = PETSC_TRUE;
763   }
764   for (i = 0; i < m; i++) {                                      /* loop over rows */
765     shift      = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT; /* starting index of the row i */
766     a->diag[i] = -1;
767     for (j = 0; j < a->rlen[i]; j++) {
768       if (a->colidx[shift + SLICE_HEIGHT * j] == i) {
769         a->diag[i] = shift + SLICE_HEIGHT * j;
770         break;
771       }
772     }
773   }
774   PetscFunctionReturn(PETSC_SUCCESS);
775 }
776 
777 /*
778   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
779 */
780 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
781 {
782   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
783   PetscInt     i, *diag, m = A->rmap->n;
784   MatScalar   *val = a->val;
785   PetscScalar *idiag, *mdiag;
786 
787   PetscFunctionBegin;
788   if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS);
789   PetscCall(MatMarkDiagonal_SeqSELL(A));
790   diag = a->diag;
791   if (!a->idiag) {
792     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
793     val = a->val;
794   }
795   mdiag = a->mdiag;
796   idiag = a->idiag;
797 
798   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
799     for (i = 0; i < m; i++) {
800       mdiag[i] = val[diag[i]];
801       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
802         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
803         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
804         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
805         A->factorerror_zeropivot_value = 0.0;
806         A->factorerror_zeropivot_row   = i;
807       }
808       idiag[i] = 1.0 / val[diag[i]];
809     }
810     PetscCall(PetscLogFlops(m));
811   } else {
812     for (i = 0; i < m; i++) {
813       mdiag[i] = val[diag[i]];
814       idiag[i] = omega / (fshift + val[diag[i]]);
815     }
816     PetscCall(PetscLogFlops(2.0 * m));
817   }
818   a->idiagvalid = PETSC_TRUE;
819   PetscFunctionReturn(PETSC_SUCCESS);
820 }
821 
822 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
823 {
824   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
825 
826   PetscFunctionBegin;
827   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
828   PetscCall(MatSeqSELLInvalidateDiagonal(A));
829   PetscFunctionReturn(PETSC_SUCCESS);
830 }
831 
832 PetscErrorCode MatDestroy_SeqSELL(Mat A)
833 {
834   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
835 
836   PetscFunctionBegin;
837 #if defined(PETSC_USE_LOG)
838   PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz));
839 #endif
840   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
841   PetscCall(ISDestroy(&a->row));
842   PetscCall(ISDestroy(&a->col));
843   PetscCall(PetscFree(a->diag));
844   PetscCall(PetscFree(a->rlen));
845   PetscCall(PetscFree(a->sliidx));
846   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
847   PetscCall(PetscFree(a->solve_work));
848   PetscCall(ISDestroy(&a->icol));
849   PetscCall(PetscFree(a->saved_values));
850   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
851 
852   PetscCall(PetscFree(A->data));
853 
854   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
855   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
856   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
857   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
858   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
859   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
860   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqAIJ_C", NULL));
861 #if defined(PETSC_HAVE_CUDA)
862   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_SeqSELL_SeqSELLCUDA_C", NULL));
863 #endif
864   PetscFunctionReturn(PETSC_SUCCESS);
865 }
866 
867 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
868 {
869   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
870 
871   PetscFunctionBegin;
872   switch (op) {
873   case MAT_ROW_ORIENTED:
874     a->roworiented = flg;
875     break;
876   case MAT_KEEP_NONZERO_PATTERN:
877     a->keepnonzeropattern = flg;
878     break;
879   case MAT_NEW_NONZERO_LOCATIONS:
880     a->nonew = (flg ? 0 : 1);
881     break;
882   case MAT_NEW_NONZERO_LOCATION_ERR:
883     a->nonew = (flg ? -1 : 0);
884     break;
885   case MAT_NEW_NONZERO_ALLOCATION_ERR:
886     a->nonew = (flg ? -2 : 0);
887     break;
888   case MAT_UNUSED_NONZERO_LOCATION_ERR:
889     a->nounused = (flg ? -1 : 0);
890     break;
891   case MAT_FORCE_DIAGONAL_ENTRIES:
892   case MAT_IGNORE_OFF_PROC_ENTRIES:
893   case MAT_USE_HASH_TABLE:
894   case MAT_SORTED_FULL:
895     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
896     break;
897   case MAT_SPD:
898   case MAT_SYMMETRIC:
899   case MAT_STRUCTURALLY_SYMMETRIC:
900   case MAT_HERMITIAN:
901   case MAT_SYMMETRY_ETERNAL:
902   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
903   case MAT_SPD_ETERNAL:
904     /* These options are handled directly by MatSetOption() */
905     break;
906   default:
907     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
908   }
909   PetscFunctionReturn(PETSC_SUCCESS);
910 }
911 
912 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
913 {
914   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
915   PetscInt     i, j, n, shift;
916   PetscScalar *x, zero = 0.0;
917 
918   PetscFunctionBegin;
919   PetscCall(VecGetLocalSize(v, &n));
920   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
921 
922   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
923     PetscInt *diag = a->diag;
924     PetscCall(VecGetArray(v, &x));
925     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
926     PetscCall(VecRestoreArray(v, &x));
927     PetscFunctionReturn(PETSC_SUCCESS);
928   }
929 
930   PetscCall(VecSet(v, zero));
931   PetscCall(VecGetArray(v, &x));
932   for (i = 0; i < n; i++) {                                 /* loop over rows */
933     shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT; /* starting index of the row i */
934     x[i]  = 0;
935     for (j = 0; j < a->rlen[i]; j++) {
936       if (a->colidx[shift + SLICE_HEIGHT * j] == i) {
937         x[i] = a->val[shift + SLICE_HEIGHT * j];
938         break;
939       }
940     }
941   }
942   PetscCall(VecRestoreArray(v, &x));
943   PetscFunctionReturn(PETSC_SUCCESS);
944 }
945 
946 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
947 {
948   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
949   const PetscScalar *l, *r;
950   PetscInt           i, j, m, n, row;
951 
952   PetscFunctionBegin;
953   if (ll) {
954     /* The local size is used so that VecMPI can be passed to this routine
955        by MatDiagonalScale_MPISELL */
956     PetscCall(VecGetLocalSize(ll, &m));
957     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
958     PetscCall(VecGetArrayRead(ll, &l));
959     for (i = 0; i < a->totalslices; i++) {                          /* loop over slices */
960       if (i == a->totalslices - 1 && (A->rmap->n % SLICE_HEIGHT)) { /* if last slice has padding rows */
961         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % SLICE_HEIGHT) {
962           if (row < (A->rmap->n % SLICE_HEIGHT)) a->val[j] *= l[SLICE_HEIGHT * i + row];
963         }
964       } else {
965         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % SLICE_HEIGHT) { a->val[j] *= l[SLICE_HEIGHT * i + row]; }
966       }
967     }
968     PetscCall(VecRestoreArrayRead(ll, &l));
969     PetscCall(PetscLogFlops(a->nz));
970   }
971   if (rr) {
972     PetscCall(VecGetLocalSize(rr, &n));
973     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
974     PetscCall(VecGetArrayRead(rr, &r));
975     for (i = 0; i < a->totalslices; i++) {                          /* loop over slices */
976       if (i == a->totalslices - 1 && (A->rmap->n % SLICE_HEIGHT)) { /* if last slice has padding rows */
977         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % SLICE_HEIGHT)) {
978           if (row < (A->rmap->n % SLICE_HEIGHT)) a->val[j] *= r[a->colidx[j]];
979         }
980       } else {
981         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
982       }
983     }
984     PetscCall(VecRestoreArrayRead(rr, &r));
985     PetscCall(PetscLogFlops(a->nz));
986   }
987   PetscCall(MatSeqSELLInvalidateDiagonal(A));
988 #if defined(PETSC_HAVE_CUDA)
989   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
990 #endif
991   PetscFunctionReturn(PETSC_SUCCESS);
992 }
993 
994 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
995 {
996   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
997   PetscInt    *cp, i, k, low, high, t, row, col, l;
998   PetscInt     shift;
999   MatScalar   *vp;
1000 
1001   PetscFunctionBegin;
1002   for (k = 0; k < m; k++) { /* loop over requested rows */
1003     row = im[k];
1004     if (row < 0) continue;
1005     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1006     shift = a->sliidx[row / SLICE_HEIGHT] + (row % SLICE_HEIGHT); /* starting index of the row */
1007     cp    = a->colidx + shift;                                    /* pointer to the row */
1008     vp    = a->val + shift;                                       /* pointer to the row */
1009     for (l = 0; l < n; l++) {                                     /* loop over requested columns */
1010       col = in[l];
1011       if (col < 0) continue;
1012       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1013       high = a->rlen[row];
1014       low  = 0; /* assume unsorted */
1015       while (high - low > 5) {
1016         t = (low + high) / 2;
1017         if (*(cp + SLICE_HEIGHT * t) > col) high = t;
1018         else low = t;
1019       }
1020       for (i = low; i < high; i++) {
1021         if (*(cp + SLICE_HEIGHT * i) > col) break;
1022         if (*(cp + SLICE_HEIGHT * i) == col) {
1023           *v++ = *(vp + SLICE_HEIGHT * i);
1024           goto finished;
1025         }
1026       }
1027       *v++ = 0.0;
1028     finished:;
1029     }
1030   }
1031   PetscFunctionReturn(PETSC_SUCCESS);
1032 }
1033 
1034 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1035 {
1036   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1037   PetscInt          i, j, m = A->rmap->n, shift;
1038   const char       *name;
1039   PetscViewerFormat format;
1040 
1041   PetscFunctionBegin;
1042   PetscCall(PetscViewerGetFormat(viewer, &format));
1043   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1044     PetscInt nofinalvalue = 0;
1045     /*
1046     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1047       nofinalvalue = 1;
1048     }
1049     */
1050     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1051     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1052     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1053 #if defined(PETSC_USE_COMPLEX)
1054     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1055 #else
1056     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1057 #endif
1058     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1059 
1060     for (i = 0; i < m; i++) {
1061       shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT;
1062       for (j = 0; j < a->rlen[i]; j++) {
1063 #if defined(PETSC_USE_COMPLEX)
1064         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + SLICE_HEIGHT * j] + 1, (double)PetscRealPart(a->val[shift + SLICE_HEIGHT * j]), (double)PetscImaginaryPart(a->val[shift + SLICE_HEIGHT * j])));
1065 #else
1066         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + SLICE_HEIGHT * j] + 1, (double)a->val[shift + SLICE_HEIGHT * j]));
1067 #endif
1068       }
1069     }
1070     /*
1071     if (nofinalvalue) {
1072 #if defined(PETSC_USE_COMPLEX)
1073       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1074 #else
1075       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1076 #endif
1077     }
1078     */
1079     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1080     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1081     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1082   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1083     PetscFunctionReturn(PETSC_SUCCESS);
1084   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1085     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1086     for (i = 0; i < m; i++) {
1087       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1088       shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT;
1089       for (j = 0; j < a->rlen[i]; j++) {
1090 #if defined(PETSC_USE_COMPLEX)
1091         if (PetscImaginaryPart(a->val[shift + SLICE_HEIGHT * j]) > 0.0 && PetscRealPart(a->val[shift + SLICE_HEIGHT * j]) != 0.0) {
1092           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + SLICE_HEIGHT * j], (double)PetscRealPart(a->val[shift + SLICE_HEIGHT * j]), (double)PetscImaginaryPart(a->val[shift + SLICE_HEIGHT * j])));
1093         } else if (PetscImaginaryPart(a->val[shift + SLICE_HEIGHT * j]) < 0.0 && PetscRealPart(a->val[shift + SLICE_HEIGHT * j]) != 0.0) {
1094           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + SLICE_HEIGHT * j], (double)PetscRealPart(a->val[shift + SLICE_HEIGHT * j]), (double)-PetscImaginaryPart(a->val[shift + SLICE_HEIGHT * j])));
1095         } else if (PetscRealPart(a->val[shift + SLICE_HEIGHT * j]) != 0.0) {
1096           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + SLICE_HEIGHT * j], (double)PetscRealPart(a->val[shift + SLICE_HEIGHT * j])));
1097         }
1098 #else
1099         if (a->val[shift + SLICE_HEIGHT * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + SLICE_HEIGHT * j], (double)a->val[shift + SLICE_HEIGHT * j]));
1100 #endif
1101       }
1102       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1103     }
1104     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1105   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1106     PetscInt    cnt = 0, jcnt;
1107     PetscScalar value;
1108 #if defined(PETSC_USE_COMPLEX)
1109     PetscBool realonly = PETSC_TRUE;
1110     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1111       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1112         realonly = PETSC_FALSE;
1113         break;
1114       }
1115     }
1116 #endif
1117 
1118     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1119     for (i = 0; i < m; i++) {
1120       jcnt  = 0;
1121       shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT;
1122       for (j = 0; j < A->cmap->n; j++) {
1123         if (jcnt < a->rlen[i] && j == a->colidx[shift + SLICE_HEIGHT * j]) {
1124           value = a->val[cnt++];
1125           jcnt++;
1126         } else {
1127           value = 0.0;
1128         }
1129 #if defined(PETSC_USE_COMPLEX)
1130         if (realonly) {
1131           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1132         } else {
1133           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1134         }
1135 #else
1136         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1137 #endif
1138       }
1139       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1140     }
1141     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1142   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1143     PetscInt fshift = 1;
1144     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1145 #if defined(PETSC_USE_COMPLEX)
1146     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1147 #else
1148     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1149 #endif
1150     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1151     for (i = 0; i < m; i++) {
1152       shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT;
1153       for (j = 0; j < a->rlen[i]; j++) {
1154 #if defined(PETSC_USE_COMPLEX)
1155         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + SLICE_HEIGHT * j] + fshift, (double)PetscRealPart(a->val[shift + SLICE_HEIGHT * j]), (double)PetscImaginaryPart(a->val[shift + SLICE_HEIGHT * j])));
1156 #else
1157         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + SLICE_HEIGHT * j] + fshift, (double)a->val[shift + SLICE_HEIGHT * j]));
1158 #endif
1159       }
1160     }
1161     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1162   } else if (format == PETSC_VIEWER_NATIVE) {
1163     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1164       PetscInt row;
1165       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1166       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % SLICE_HEIGHT) {
1167 #if defined(PETSC_USE_COMPLEX)
1168         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1169           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", SLICE_HEIGHT * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1170         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1171           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", SLICE_HEIGHT * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1172         } else {
1173           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", SLICE_HEIGHT * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1174         }
1175 #else
1176         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", SLICE_HEIGHT * i + row, a->colidx[j], (double)a->val[j]));
1177 #endif
1178       }
1179     }
1180   } else {
1181     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1182     if (A->factortype) {
1183       for (i = 0; i < m; i++) {
1184         shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT;
1185         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1186         /* L part */
1187         for (j = shift; j < a->diag[i]; j += SLICE_HEIGHT) {
1188 #if defined(PETSC_USE_COMPLEX)
1189           if (PetscImaginaryPart(a->val[shift + SLICE_HEIGHT * j]) > 0.0) {
1190             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1191           } else if (PetscImaginaryPart(a->val[shift + SLICE_HEIGHT * j]) < 0.0) {
1192             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1193           } else {
1194             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1195           }
1196 #else
1197           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1198 #endif
1199         }
1200         /* diagonal */
1201         j = a->diag[i];
1202 #if defined(PETSC_USE_COMPLEX)
1203         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1204           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1205         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1206           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1207         } else {
1208           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1209         }
1210 #else
1211         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1212 #endif
1213 
1214         /* U part */
1215         for (j = a->diag[i] + 1; j < shift + SLICE_HEIGHT * a->rlen[i]; j += SLICE_HEIGHT) {
1216 #if defined(PETSC_USE_COMPLEX)
1217           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1218             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1219           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1220             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1221           } else {
1222             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1223           }
1224 #else
1225           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1226 #endif
1227         }
1228         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1229       }
1230     } else {
1231       for (i = 0; i < m; i++) {
1232         shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT;
1233         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1234         for (j = 0; j < a->rlen[i]; j++) {
1235 #if defined(PETSC_USE_COMPLEX)
1236           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1237             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + SLICE_HEIGHT * j], (double)PetscRealPart(a->val[shift + SLICE_HEIGHT * j]), (double)PetscImaginaryPart(a->val[shift + SLICE_HEIGHT * j])));
1238           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1239             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + SLICE_HEIGHT * j], (double)PetscRealPart(a->val[shift + SLICE_HEIGHT * j]), (double)-PetscImaginaryPart(a->val[shift + SLICE_HEIGHT * j])));
1240           } else {
1241             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + SLICE_HEIGHT * j], (double)PetscRealPart(a->val[shift + SLICE_HEIGHT * j])));
1242           }
1243 #else
1244           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + SLICE_HEIGHT * j], (double)a->val[shift + SLICE_HEIGHT * j]));
1245 #endif
1246         }
1247         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1248       }
1249     }
1250     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1251   }
1252   PetscCall(PetscViewerFlush(viewer));
1253   PetscFunctionReturn(PETSC_SUCCESS);
1254 }
1255 
1256 #include <petscdraw.h>
1257 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1258 {
1259   Mat               A = (Mat)Aa;
1260   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1261   PetscInt          i, j, m = A->rmap->n, shift;
1262   int               color;
1263   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1264   PetscViewer       viewer;
1265   PetscViewerFormat format;
1266 
1267   PetscFunctionBegin;
1268   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1269   PetscCall(PetscViewerGetFormat(viewer, &format));
1270   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1271 
1272   /* loop over matrix elements drawing boxes */
1273 
1274   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1275     PetscDrawCollectiveBegin(draw);
1276     /* Blue for negative, Cyan for zero and  Red for positive */
1277     color = PETSC_DRAW_BLUE;
1278     for (i = 0; i < m; i++) {
1279       shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT; /* starting index of the row i */
1280       y_l   = m - i - 1.0;
1281       y_r   = y_l + 1.0;
1282       for (j = 0; j < a->rlen[i]; j++) {
1283         x_l = a->colidx[shift + SLICE_HEIGHT * j];
1284         x_r = x_l + 1.0;
1285         if (PetscRealPart(a->val[shift + SLICE_HEIGHT * j]) >= 0.) continue;
1286         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1287       }
1288     }
1289     color = PETSC_DRAW_CYAN;
1290     for (i = 0; i < m; i++) {
1291       shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT;
1292       y_l   = m - i - 1.0;
1293       y_r   = y_l + 1.0;
1294       for (j = 0; j < a->rlen[i]; j++) {
1295         x_l = a->colidx[shift + SLICE_HEIGHT * j];
1296         x_r = x_l + 1.0;
1297         if (a->val[shift + SLICE_HEIGHT * j] != 0.) continue;
1298         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1299       }
1300     }
1301     color = PETSC_DRAW_RED;
1302     for (i = 0; i < m; i++) {
1303       shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT;
1304       y_l   = m - i - 1.0;
1305       y_r   = y_l + 1.0;
1306       for (j = 0; j < a->rlen[i]; j++) {
1307         x_l = a->colidx[shift + SLICE_HEIGHT * j];
1308         x_r = x_l + 1.0;
1309         if (PetscRealPart(a->val[shift + SLICE_HEIGHT * j]) <= 0.) continue;
1310         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1311       }
1312     }
1313     PetscDrawCollectiveEnd(draw);
1314   } else {
1315     /* use contour shading to indicate magnitude of values */
1316     /* first determine max of all nonzero values */
1317     PetscReal minv = 0.0, maxv = 0.0;
1318     PetscInt  count = 0;
1319     PetscDraw popup;
1320     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1321       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1322     }
1323     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1324     PetscCall(PetscDrawGetPopup(draw, &popup));
1325     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1326 
1327     PetscDrawCollectiveBegin(draw);
1328     for (i = 0; i < m; i++) {
1329       shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT;
1330       y_l   = m - i - 1.0;
1331       y_r   = y_l + 1.0;
1332       for (j = 0; j < a->rlen[i]; j++) {
1333         x_l   = a->colidx[shift + SLICE_HEIGHT * j];
1334         x_r   = x_l + 1.0;
1335         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1336         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1337         count++;
1338       }
1339     }
1340     PetscDrawCollectiveEnd(draw);
1341   }
1342   PetscFunctionReturn(PETSC_SUCCESS);
1343 }
1344 
1345 #include <petscdraw.h>
1346 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1347 {
1348   PetscDraw draw;
1349   PetscReal xr, yr, xl, yl, h, w;
1350   PetscBool isnull;
1351 
1352   PetscFunctionBegin;
1353   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1354   PetscCall(PetscDrawIsNull(draw, &isnull));
1355   if (isnull) PetscFunctionReturn(PETSC_SUCCESS);
1356 
1357   xr = A->cmap->n;
1358   yr = A->rmap->n;
1359   h  = yr / 10.0;
1360   w  = xr / 10.0;
1361   xr += w;
1362   yr += h;
1363   xl = -w;
1364   yl = -h;
1365   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1366   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1367   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1368   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1369   PetscCall(PetscDrawSave(draw));
1370   PetscFunctionReturn(PETSC_SUCCESS);
1371 }
1372 
1373 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1374 {
1375   PetscBool iascii, isbinary, isdraw;
1376 
1377   PetscFunctionBegin;
1378   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1379   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1380   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1381   if (iascii) {
1382     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1383   } else if (isbinary) {
1384     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1385   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1386   PetscFunctionReturn(PETSC_SUCCESS);
1387 }
1388 
1389 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1390 {
1391   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1392   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1393   MatScalar   *vp;
1394 
1395   PetscFunctionBegin;
1396   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
1397   /* To do: compress out the unused elements */
1398   PetscCall(MatMarkDiagonal_SeqSELL(A));
1399   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1400   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1401   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1402   a->nonzerorowcnt = 0;
1403   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1404   for (i = 0; i < a->totalslices; ++i) {
1405     shift = a->sliidx[i];                                                 /* starting index of the slice */
1406     cp    = a->colidx + shift;                                            /* pointer to the column indices of the slice */
1407     vp    = a->val + shift;                                               /* pointer to the nonzero values of the slice */
1408     for (row_in_slice = 0; row_in_slice < SLICE_HEIGHT; ++row_in_slice) { /* loop over rows in the slice */
1409       row  = SLICE_HEIGHT * i + row_in_slice;
1410       nrow = a->rlen[row]; /* number of nonzeros in row */
1411       /*
1412         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1413         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1414       */
1415       lastcol = 0;
1416       if (nrow > 0) { /* nonempty row */
1417         a->nonzerorowcnt++;
1418         lastcol = cp[SLICE_HEIGHT * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1419       } else if (!row_in_slice) {                               /* first row of the correct slice is empty */
1420         for (j = 1; j < SLICE_HEIGHT; j++) {
1421           if (a->rlen[SLICE_HEIGHT * i + j]) {
1422             lastcol = cp[j];
1423             break;
1424           }
1425         }
1426       } else {
1427         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1428       }
1429 
1430       for (k = nrow; k < (a->sliidx[i + 1] - shift) / SLICE_HEIGHT; ++k) {
1431         cp[SLICE_HEIGHT * k + row_in_slice] = lastcol;
1432         vp[SLICE_HEIGHT * k + row_in_slice] = (MatScalar)0;
1433       }
1434     }
1435   }
1436 
1437   A->info.mallocs += a->reallocs;
1438   a->reallocs = 0;
1439 
1440   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1441   PetscFunctionReturn(PETSC_SUCCESS);
1442 }
1443 
1444 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1445 {
1446   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1447 
1448   PetscFunctionBegin;
1449   info->block_size   = 1.0;
1450   info->nz_allocated = a->maxallocmat;
1451   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1452   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1453   info->assemblies   = A->num_ass;
1454   info->mallocs      = A->info.mallocs;
1455   info->memory       = 0; /* REVIEW ME */
1456   if (A->factortype) {
1457     info->fill_ratio_given  = A->info.fill_ratio_given;
1458     info->fill_ratio_needed = A->info.fill_ratio_needed;
1459     info->factor_mallocs    = A->info.factor_mallocs;
1460   } else {
1461     info->fill_ratio_given  = 0;
1462     info->fill_ratio_needed = 0;
1463     info->factor_mallocs    = 0;
1464   }
1465   PetscFunctionReturn(PETSC_SUCCESS);
1466 }
1467 
1468 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1469 {
1470   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1471   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1472   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1473   MatScalar   *vp, value;
1474 #if defined(PETSC_HAVE_CUDA)
1475   PetscBool inserted = PETSC_FALSE;
1476 #endif
1477 
1478   PetscFunctionBegin;
1479   for (k = 0; k < m; k++) { /* loop over added rows */
1480     row = im[k];
1481     if (row < 0) continue;
1482     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1483     shift = a->sliidx[row / SLICE_HEIGHT] + row % SLICE_HEIGHT; /* starting index of the row */
1484     cp    = a->colidx + shift;                                  /* pointer to the row */
1485     vp    = a->val + shift;                                     /* pointer to the row */
1486     nrow  = a->rlen[row];
1487     low   = 0;
1488     high  = nrow;
1489 
1490     for (l = 0; l < n; l++) { /* loop over added columns */
1491       col = in[l];
1492       if (col < 0) continue;
1493       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1494       if (a->roworiented) {
1495         value = v[l + k * n];
1496       } else {
1497         value = v[k + l * m];
1498       }
1499       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1500 
1501       /* search in this row for the specified column, i indicates the column to be set */
1502       if (col <= lastcol) low = 0;
1503       else high = nrow;
1504       lastcol = col;
1505       while (high - low > 5) {
1506         t = (low + high) / 2;
1507         if (*(cp + SLICE_HEIGHT * t) > col) high = t;
1508         else low = t;
1509       }
1510       for (i = low; i < high; i++) {
1511         if (*(cp + SLICE_HEIGHT * i) > col) break;
1512         if (*(cp + SLICE_HEIGHT * i) == col) {
1513           if (is == ADD_VALUES) *(vp + SLICE_HEIGHT * i) += value;
1514           else *(vp + SLICE_HEIGHT * i) = value;
1515 #if defined(PETSC_HAVE_CUDA)
1516           inserted = PETSC_TRUE;
1517 #endif
1518           low = i + 1;
1519           goto noinsert;
1520         }
1521       }
1522       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1523       if (nonew == 1) goto noinsert;
1524       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1525       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1526       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / SLICE_HEIGHT, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1527       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1528       for (ii = nrow - 1; ii >= i; ii--) {
1529         *(cp + SLICE_HEIGHT * (ii + 1)) = *(cp + SLICE_HEIGHT * ii);
1530         *(vp + SLICE_HEIGHT * (ii + 1)) = *(vp + SLICE_HEIGHT * ii);
1531       }
1532       a->rlen[row]++;
1533       *(cp + SLICE_HEIGHT * i) = col;
1534       *(vp + SLICE_HEIGHT * i) = value;
1535       a->nz++;
1536       A->nonzerostate++;
1537 #if defined(PETSC_HAVE_CUDA)
1538       inserted = PETSC_TRUE;
1539 #endif
1540       low = i + 1;
1541       high++;
1542       nrow++;
1543     noinsert:;
1544     }
1545     a->rlen[row] = nrow;
1546   }
1547 #if defined(PETSC_HAVE_CUDA)
1548   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU;
1549 #endif
1550   PetscFunctionReturn(PETSC_SUCCESS);
1551 }
1552 
1553 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1554 {
1555   PetscFunctionBegin;
1556   /* If the two matrices have the same copy implementation, use fast copy. */
1557   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1558     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1559     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1560 
1561     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1562     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1563   } else {
1564     PetscCall(MatCopy_Basic(A, B, str));
1565   }
1566   PetscFunctionReturn(PETSC_SUCCESS);
1567 }
1568 
1569 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1570 {
1571   PetscFunctionBegin;
1572   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1573   PetscFunctionReturn(PETSC_SUCCESS);
1574 }
1575 
1576 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1577 {
1578   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1579 
1580   PetscFunctionBegin;
1581   *array = a->val;
1582   PetscFunctionReturn(PETSC_SUCCESS);
1583 }
1584 
1585 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1586 {
1587   PetscFunctionBegin;
1588   PetscFunctionReturn(PETSC_SUCCESS);
1589 }
1590 
1591 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1592 {
1593   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1594   PetscInt     i;
1595   MatScalar   *aval = a->val;
1596 
1597   PetscFunctionBegin;
1598   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1599 #if defined(PETSC_HAVE_CUDA)
1600   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1601 #endif
1602   PetscFunctionReturn(PETSC_SUCCESS);
1603 }
1604 
1605 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1606 {
1607   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1608   PetscInt     i;
1609   MatScalar   *aval = a->val;
1610 
1611   PetscFunctionBegin;
1612   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1613   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1614 #if defined(PETSC_HAVE_CUDA)
1615   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
1616 #endif
1617   PetscFunctionReturn(PETSC_SUCCESS);
1618 }
1619 
1620 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1621 {
1622   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1623   MatScalar   *aval   = a->val;
1624   PetscScalar  oalpha = alpha;
1625   PetscBLASInt one    = 1, size;
1626 
1627   PetscFunctionBegin;
1628   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1629   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1630   PetscCall(PetscLogFlops(a->nz));
1631   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1632 #if defined(PETSC_HAVE_CUDA)
1633   if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU;
1634 #endif
1635   PetscFunctionReturn(PETSC_SUCCESS);
1636 }
1637 
1638 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1639 {
1640   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1641 
1642   PetscFunctionBegin;
1643   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1644   PetscCall(MatShift_Basic(Y, a));
1645   PetscFunctionReturn(PETSC_SUCCESS);
1646 }
1647 
1648 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1649 {
1650   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1651   PetscScalar       *x, sum, *t;
1652   const MatScalar   *idiag = NULL, *mdiag;
1653   const PetscScalar *b, *xb;
1654   PetscInt           n, m = A->rmap->n, i, j, shift;
1655   const PetscInt    *diag;
1656 
1657   PetscFunctionBegin;
1658   its = its * lits;
1659 
1660   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1661   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1662   a->fshift = fshift;
1663   a->omega  = omega;
1664 
1665   diag  = a->diag;
1666   t     = a->ssor_work;
1667   idiag = a->idiag;
1668   mdiag = a->mdiag;
1669 
1670   PetscCall(VecGetArray(xx, &x));
1671   PetscCall(VecGetArrayRead(bb, &b));
1672   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1673   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1674   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1675   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1676 
1677   if (flag & SOR_ZERO_INITIAL_GUESS) {
1678     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1679       for (i = 0; i < m; i++) {
1680         shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT; /* starting index of the row i */
1681         sum   = b[i];
1682         n     = (diag[i] - shift) / SLICE_HEIGHT;
1683         for (j = 0; j < n; j++) sum -= a->val[shift + SLICE_HEIGHT * j] * x[a->colidx[shift + SLICE_HEIGHT * j]];
1684         t[i] = sum;
1685         x[i] = sum * idiag[i];
1686       }
1687       xb = t;
1688       PetscCall(PetscLogFlops(a->nz));
1689     } else xb = b;
1690     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1691       for (i = m - 1; i >= 0; i--) {
1692         shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT; /* starting index of the row i */
1693         sum   = xb[i];
1694         n     = a->rlen[i] - (diag[i] - shift) / SLICE_HEIGHT - 1;
1695         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + SLICE_HEIGHT * j] * x[a->colidx[diag[i] + SLICE_HEIGHT * j]];
1696         if (xb == b) {
1697           x[i] = sum * idiag[i];
1698         } else {
1699           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1700         }
1701       }
1702       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1703     }
1704     its--;
1705   }
1706   while (its--) {
1707     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1708       for (i = 0; i < m; i++) {
1709         /* lower */
1710         shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT; /* starting index of the row i */
1711         sum   = b[i];
1712         n     = (diag[i] - shift) / SLICE_HEIGHT;
1713         for (j = 0; j < n; j++) sum -= a->val[shift + SLICE_HEIGHT * j] * x[a->colidx[shift + SLICE_HEIGHT * j]];
1714         t[i] = sum; /* save application of the lower-triangular part */
1715         /* upper */
1716         n = a->rlen[i] - (diag[i] - shift) / SLICE_HEIGHT - 1;
1717         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + SLICE_HEIGHT * j] * x[a->colidx[diag[i] + SLICE_HEIGHT * j]];
1718         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1719       }
1720       xb = t;
1721       PetscCall(PetscLogFlops(2.0 * a->nz));
1722     } else xb = b;
1723     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1724       for (i = m - 1; i >= 0; i--) {
1725         shift = a->sliidx[i / SLICE_HEIGHT] + i % SLICE_HEIGHT; /* starting index of the row i */
1726         sum   = xb[i];
1727         if (xb == b) {
1728           /* whole matrix (no checkpointing available) */
1729           n = a->rlen[i];
1730           for (j = 0; j < n; j++) sum -= a->val[shift + SLICE_HEIGHT * j] * x[a->colidx[shift + SLICE_HEIGHT * j]];
1731           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1732         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1733           n = a->rlen[i] - (diag[i] - shift) / SLICE_HEIGHT - 1;
1734           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + SLICE_HEIGHT * j] * x[a->colidx[diag[i] + SLICE_HEIGHT * j]];
1735           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1736         }
1737       }
1738       if (xb == b) {
1739         PetscCall(PetscLogFlops(2.0 * a->nz));
1740       } else {
1741         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1742       }
1743     }
1744   }
1745   PetscCall(VecRestoreArray(xx, &x));
1746   PetscCall(VecRestoreArrayRead(bb, &b));
1747   PetscFunctionReturn(PETSC_SUCCESS);
1748 }
1749 
1750 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1751                                        MatGetRow_SeqSELL,
1752                                        MatRestoreRow_SeqSELL,
1753                                        MatMult_SeqSELL,
1754                                        /* 4*/ MatMultAdd_SeqSELL,
1755                                        MatMultTranspose_SeqSELL,
1756                                        MatMultTransposeAdd_SeqSELL,
1757                                        NULL,
1758                                        NULL,
1759                                        NULL,
1760                                        /* 10*/ NULL,
1761                                        NULL,
1762                                        NULL,
1763                                        MatSOR_SeqSELL,
1764                                        NULL,
1765                                        /* 15*/ MatGetInfo_SeqSELL,
1766                                        MatEqual_SeqSELL,
1767                                        MatGetDiagonal_SeqSELL,
1768                                        MatDiagonalScale_SeqSELL,
1769                                        NULL,
1770                                        /* 20*/ NULL,
1771                                        MatAssemblyEnd_SeqSELL,
1772                                        MatSetOption_SeqSELL,
1773                                        MatZeroEntries_SeqSELL,
1774                                        /* 24*/ NULL,
1775                                        NULL,
1776                                        NULL,
1777                                        NULL,
1778                                        NULL,
1779                                        /* 29*/ MatSetUp_SeqSELL,
1780                                        NULL,
1781                                        NULL,
1782                                        NULL,
1783                                        NULL,
1784                                        /* 34*/ MatDuplicate_SeqSELL,
1785                                        NULL,
1786                                        NULL,
1787                                        NULL,
1788                                        NULL,
1789                                        /* 39*/ NULL,
1790                                        NULL,
1791                                        NULL,
1792                                        MatGetValues_SeqSELL,
1793                                        MatCopy_SeqSELL,
1794                                        /* 44*/ NULL,
1795                                        MatScale_SeqSELL,
1796                                        MatShift_SeqSELL,
1797                                        NULL,
1798                                        NULL,
1799                                        /* 49*/ NULL,
1800                                        NULL,
1801                                        NULL,
1802                                        NULL,
1803                                        NULL,
1804                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        NULL,
1809                                        /* 59*/ NULL,
1810                                        MatDestroy_SeqSELL,
1811                                        MatView_SeqSELL,
1812                                        NULL,
1813                                        NULL,
1814                                        /* 64*/ NULL,
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        /* 69*/ NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        NULL,
1824                                        /* 74*/ NULL,
1825                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1826                                        NULL,
1827                                        NULL,
1828                                        NULL,
1829                                        /* 79*/ NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        NULL,
1834                                        /* 84*/ NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        NULL,
1839                                        /* 89*/ NULL,
1840                                        NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        NULL,
1844                                        /* 94*/ NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        /* 99*/ NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        MatConjugate_SeqSELL,
1853                                        NULL,
1854                                        /*104*/ NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        /*109*/ NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        MatMissingDiagonal_SeqSELL,
1864                                        /*114*/ NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        NULL,
1869                                        /*119*/ NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        NULL,
1874                                        /*124*/ NULL,
1875                                        NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        NULL,
1879                                        /*129*/ NULL,
1880                                        NULL,
1881                                        NULL,
1882                                        NULL,
1883                                        NULL,
1884                                        /*134*/ NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        /*139*/ NULL,
1890                                        NULL,
1891                                        NULL,
1892                                        MatFDColoringSetUp_SeqXAIJ,
1893                                        NULL,
1894                                        /*144*/ NULL,
1895                                        NULL,
1896                                        NULL,
1897                                        NULL,
1898                                        NULL,
1899                                        NULL,
1900                                        /*150*/ NULL,
1901                                        NULL};
1902 
1903 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1904 {
1905   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1906 
1907   PetscFunctionBegin;
1908   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1909 
1910   /* allocate space for values if not already there */
1911   if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values));
1912 
1913   /* copy values over */
1914   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1915   PetscFunctionReturn(PETSC_SUCCESS);
1916 }
1917 
1918 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1919 {
1920   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1921 
1922   PetscFunctionBegin;
1923   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1924   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1925   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1926   PetscFunctionReturn(PETSC_SUCCESS);
1927 }
1928 
1929 /*@C
1930  MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
1931 
1932  Not Collective
1933 
1934  Input Parameters:
1935 +  mat - a `MATSEQSELL` matrix
1936 -  array - pointer to the data
1937 
1938  Level: intermediate
1939 
1940  .seealso: `Mat`, `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1941  @*/
1942 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
1943 {
1944   PetscFunctionBegin;
1945   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1946   PetscFunctionReturn(PETSC_SUCCESS);
1947 }
1948 
1949 #if defined(PETSC_HAVE_CUDA)
1950 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat);
1951 #endif
1952 
1953 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1954 {
1955   Mat_SeqSELL *b;
1956   PetscMPIInt  size;
1957 
1958   PetscFunctionBegin;
1959   PetscCall(PetscCitationsRegister(citation, &cited));
1960   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
1961   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
1962 
1963   PetscCall(PetscNew(&b));
1964 
1965   B->data = (void *)b;
1966 
1967   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
1968 
1969   b->row                = NULL;
1970   b->col                = NULL;
1971   b->icol               = NULL;
1972   b->reallocs           = 0;
1973   b->ignorezeroentries  = PETSC_FALSE;
1974   b->roworiented        = PETSC_TRUE;
1975   b->nonew              = 0;
1976   b->diag               = NULL;
1977   b->solve_work         = NULL;
1978   B->spptr              = NULL;
1979   b->saved_values       = NULL;
1980   b->idiag              = NULL;
1981   b->mdiag              = NULL;
1982   b->ssor_work          = NULL;
1983   b->omega              = 1.0;
1984   b->fshift             = 0.0;
1985   b->idiagvalid         = PETSC_FALSE;
1986   b->keepnonzeropattern = PETSC_FALSE;
1987 
1988   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
1989   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
1990   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
1991   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
1992   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
1993   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
1994   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqAIJ_C", MatConvert_SeqSELL_SeqAIJ));
1995 #if defined(PETSC_HAVE_CUDA)
1996   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_SeqSELL_SeqSELLCUDA_C", MatConvert_SeqSELL_SeqSELLCUDA));
1997 #endif
1998   PetscFunctionReturn(PETSC_SUCCESS);
1999 }
2000 
2001 /*
2002  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
2003  */
2004 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
2005 {
2006   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
2007   PetscInt     i, m                           = A->rmap->n;
2008   PetscInt     totalslices = a->totalslices;
2009 
2010   PetscFunctionBegin;
2011   C->factortype = A->factortype;
2012   c->row        = NULL;
2013   c->col        = NULL;
2014   c->icol       = NULL;
2015   c->reallocs   = 0;
2016   C->assembled  = PETSC_TRUE;
2017 
2018   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2019   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2020 
2021   PetscCall(PetscMalloc1(SLICE_HEIGHT * totalslices, &c->rlen));
2022   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2023 
2024   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2025   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2026 
2027   /* allocate the matrix space */
2028   if (mallocmatspace) {
2029     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2030 
2031     c->singlemalloc = PETSC_TRUE;
2032 
2033     if (m > 0) {
2034       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2035       if (cpvalues == MAT_COPY_VALUES) {
2036         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2037       } else {
2038         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2039       }
2040     }
2041   }
2042 
2043   c->ignorezeroentries = a->ignorezeroentries;
2044   c->roworiented       = a->roworiented;
2045   c->nonew             = a->nonew;
2046   if (a->diag) {
2047     PetscCall(PetscMalloc1(m, &c->diag));
2048     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2049   } else c->diag = NULL;
2050 
2051   c->solve_work         = NULL;
2052   c->saved_values       = NULL;
2053   c->idiag              = NULL;
2054   c->ssor_work          = NULL;
2055   c->keepnonzeropattern = a->keepnonzeropattern;
2056   c->free_val           = PETSC_TRUE;
2057   c->free_colidx        = PETSC_TRUE;
2058 
2059   c->maxallocmat  = a->maxallocmat;
2060   c->maxallocrow  = a->maxallocrow;
2061   c->rlenmax      = a->rlenmax;
2062   c->nz           = a->nz;
2063   C->preallocated = PETSC_TRUE;
2064 
2065   c->nonzerorowcnt = a->nonzerorowcnt;
2066   C->nonzerostate  = A->nonzerostate;
2067 
2068   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2069   PetscFunctionReturn(PETSC_SUCCESS);
2070 }
2071 
2072 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2073 {
2074   PetscFunctionBegin;
2075   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2076   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2077   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2078   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2079   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2080   PetscFunctionReturn(PETSC_SUCCESS);
2081 }
2082 
2083 /*MC
2084    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2085    based on the sliced Ellpack format
2086 
2087    Options Database Key:
2088 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2089 
2090    Level: beginner
2091 
2092 .seealso: `Mat`, `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2093 M*/
2094 
2095 /*MC
2096    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2097 
2098    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2099    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2100   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2101   for communicators controlling multiple processes.  It is recommended that you call both of
2102   the above preallocation routines for simplicity.
2103 
2104    Options Database Key:
2105 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2106 
2107   Level: beginner
2108 
2109   Notes:
2110    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2111 
2112    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2113    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2114 
2115   Developer Notes:
2116    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2117 
2118    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2119 .vb
2120                             (2 0  3 4)
2121    Consider the matrix A =  (5 0  6 0)
2122                             (0 0  7 8)
2123                             (0 0  9 9)
2124 
2125    symbolically the Ellpack format can be written as
2126 
2127         (2 3 4 |)           (0 2 3 |)
2128    v =  (5 6 0 |)  colidx = (0 2 2 |)
2129         --------            ---------
2130         (7 8 |)             (2 3 |)
2131         (9 9 |)             (2 3 |)
2132 
2133     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2134     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2135     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2136 
2137     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2138 
2139 .ve
2140 
2141       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2142 
2143  References:
2144 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2145    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2146 
2147 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2148 M*/
2149 
2150 /*@C
2151        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2152 
2153  Collective
2154 
2155  Input Parameters:
2156 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2157 .  m - number of rows
2158 .  n - number of columns
2159 .  rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided
2160 -  rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL
2161 
2162  Output Parameter:
2163 .  A - the matrix
2164 
2165  Level: intermediate
2166 
2167  Notes:
2168  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2169  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2170  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2171 
2172  Specify the preallocated storage with either `rlenmax` or `rlen` (not both).
2173  Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory
2174  allocation.
2175 
2176  .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2177  @*/
2178 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A)
2179 {
2180   PetscFunctionBegin;
2181   PetscCall(MatCreate(comm, A));
2182   PetscCall(MatSetSizes(*A, m, n, m, n));
2183   PetscCall(MatSetType(*A, MATSEQSELL));
2184   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen));
2185   PetscFunctionReturn(PETSC_SUCCESS);
2186 }
2187 
2188 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2189 {
2190   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2191   PetscInt     totalslices = a->totalslices;
2192 
2193   PetscFunctionBegin;
2194   /* If the  matrix dimensions are not equal,or no of nonzeros */
2195   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2196     *flg = PETSC_FALSE;
2197     PetscFunctionReturn(PETSC_SUCCESS);
2198   }
2199   /* if the a->colidx are the same */
2200   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2201   if (!*flg) PetscFunctionReturn(PETSC_SUCCESS);
2202   /* if a->val are the same */
2203   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2204   PetscFunctionReturn(PETSC_SUCCESS);
2205 }
2206 
2207 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2208 {
2209   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2210 
2211   PetscFunctionBegin;
2212   a->idiagvalid = PETSC_FALSE;
2213   PetscFunctionReturn(PETSC_SUCCESS);
2214 }
2215 
2216 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2217 {
2218 #if defined(PETSC_USE_COMPLEX)
2219   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2220   PetscInt     i;
2221   PetscScalar *val = a->val;
2222 
2223   PetscFunctionBegin;
2224   for (i = 0; i < a->sliidx[a->totalslices]; i++) { val[i] = PetscConj(val[i]); }
2225   #if defined(PETSC_HAVE_CUDA)
2226   if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU;
2227   #endif
2228 #else
2229   PetscFunctionBegin;
2230 #endif
2231   PetscFunctionReturn(PETSC_SUCCESS);
2232 }
2233