xref: /petsc/src/mat/impls/sell/seq/sell.c (revision cedd07cade5cbdfdad435c8172b7ec8972d9cd8d)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h> /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited      = PETSC_FALSE;
10 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n"
11                                " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
12                                " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
13                                " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
14                                " year = 2018\n"
15                                "}\n";
16 
17 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
18 
19   #include <immintrin.h>
20 
21   #if !defined(_MM_SCALE_8)
22     #define _MM_SCALE_8 8
23   #endif
24 
25   #if defined(__AVX512F__)
26     /* these do not work
27    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
28    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
29   */
30     #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
31       /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
32       vec_idx  = _mm256_loadu_si256((__m256i const *)acolidx); \
33       vec_vals = _mm512_loadu_pd(aval); \
34       vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \
35       vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y)
36   #elif defined(__AVX2__) && defined(__FMA__)
37     #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \
38       vec_vals = _mm256_loadu_pd(aval); \
39       vec_idx  = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \
40       vec_x    = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \
41       vec_y    = _mm256_fmadd_pd(vec_x, vec_vals, vec_y)
42   #endif
43 #endif /* PETSC_HAVE_IMMINTRIN_H */
44 
45 /*@C
46  MatSeqSELLSetPreallocation - For good matrix assembly performance
47  the user should preallocate the matrix storage by setting the parameter nz
48  (or the array nnz).  By setting these parameters accurately, performance
49  during matrix assembly can be increased significantly.
50 
51  Collective
52 
53  Input Parameters:
54  +  B - The `MATSEQSELL` matrix
55  .  nz - number of nonzeros per row (same for all rows)
56  -  nnz - array containing the number of nonzeros in the various rows
57  (possibly different for each row) or NULL
58 
59  Notes:
60  If nnz is given then nz is ignored.
61 
62  Specify the preallocated storage with either nz or nnz (not both).
63  Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
64  allocation.  For large problems you MUST preallocate memory or you
65  will get TERRIBLE performance, see the users' manual chapter on matrices.
66 
67  You can call `MatGetInfo()` to get information on how effective the preallocation was;
68  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
69  You can also run with the option -info and look for messages with the string
70  malloc in them to see if additional memory allocation was needed.
71 
72  Developers: Use nz of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix
73  entries or columns indices.
74 
75  The maximum number of nonzeos in any row should be as accurate as possible.
76  If it is underestimated, you will get bad performance due to reallocation
77  (MatSeqXSELLReallocateSELL).
78 
79  Level: intermediate
80 
81  .seealso: `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
82  @*/
83 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[])
84 {
85   PetscFunctionBegin;
86   PetscValidHeaderSpecific(B, MAT_CLASSID, 1);
87   PetscValidType(B, 1);
88   PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen));
89   PetscFunctionReturn(0);
90 }
91 
92 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[])
93 {
94   Mat_SeqSELL *b;
95   PetscInt     i, j, totalslices;
96   PetscBool    skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE;
97 
98   PetscFunctionBegin;
99   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
100   if (maxallocrow == MAT_SKIP_ALLOCATION) {
101     skipallocation = PETSC_TRUE;
102     maxallocrow    = 0;
103   }
104 
105   PetscCall(PetscLayoutSetUp(B->rmap));
106   PetscCall(PetscLayoutSetUp(B->cmap));
107 
108   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
109   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
110   PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow);
111   if (rlen) {
112     for (i = 0; i < B->rmap->n; i++) {
113       PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]);
114       PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n);
115     }
116   }
117 
118   B->preallocated = PETSC_TRUE;
119 
120   b = (Mat_SeqSELL *)B->data;
121 
122   totalslices    = PetscCeilInt(B->rmap->n, 8);
123   b->totalslices = totalslices;
124   if (!skipallocation) {
125     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n", B->rmap->n));
126 
127     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
128       PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx));
129     }
130     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
131       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
132       else if (maxallocrow < 0) maxallocrow = 1;
133       for (i = 0; i <= totalslices; i++) b->sliidx[i] = i * 8 * maxallocrow;
134     } else {
135       maxallocrow  = 0;
136       b->sliidx[0] = 0;
137       for (i = 1; i < totalslices; i++) {
138         b->sliidx[i] = 0;
139         for (j = 0; j < 8; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[8 * (i - 1) + j]);
140         maxallocrow = PetscMax(b->sliidx[i], maxallocrow);
141         PetscCall(PetscIntSumError(b->sliidx[i - 1], 8 * b->sliidx[i], &b->sliidx[i]));
142       }
143       /* last slice */
144       b->sliidx[totalslices] = 0;
145       for (j = (totalslices - 1) * 8; j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]);
146       maxallocrow            = PetscMax(b->sliidx[totalslices], maxallocrow);
147       b->sliidx[totalslices] = b->sliidx[totalslices - 1] + 8 * b->sliidx[totalslices];
148     }
149 
150     /* allocate space for val, colidx, rlen */
151     /* FIXME: should B's old memory be unlogged? */
152     PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx));
153     /* FIXME: assuming an element of the bit array takes 8 bits */
154     PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx));
155     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
156     PetscCall(PetscCalloc1(8 * totalslices, &b->rlen));
157 
158     b->singlemalloc = PETSC_TRUE;
159     b->free_val     = PETSC_TRUE;
160     b->free_colidx  = PETSC_TRUE;
161   } else {
162     b->free_val    = PETSC_FALSE;
163     b->free_colidx = PETSC_FALSE;
164   }
165 
166   b->nz               = 0;
167   b->maxallocrow      = maxallocrow;
168   b->rlenmax          = maxallocrow;
169   b->maxallocmat      = b->sliidx[totalslices];
170   B->info.nz_unneeded = (double)b->maxallocmat;
171   if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
172   PetscFunctionReturn(0);
173 }
174 
175 PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
176 {
177   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
178   PetscInt     shift;
179 
180   PetscFunctionBegin;
181   PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row);
182   if (nz) *nz = a->rlen[row];
183   shift = a->sliidx[row >> 3] + (row & 0x07);
184   if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals));
185   if (idx) {
186     PetscInt j;
187     for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + 8 * j];
188     *idx = a->getrowcols;
189   }
190   if (v) {
191     PetscInt j;
192     for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + 8 * j];
193     *v = a->getrowvals;
194   }
195   PetscFunctionReturn(0);
196 }
197 
198 PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v)
199 {
200   PetscFunctionBegin;
201   PetscFunctionReturn(0);
202 }
203 
204 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
205 {
206   Mat          B;
207   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
208   PetscInt     i;
209 
210   PetscFunctionBegin;
211   if (reuse == MAT_REUSE_MATRIX) {
212     B = *newmat;
213     PetscCall(MatZeroEntries(B));
214   } else {
215     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
216     PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N));
217     PetscCall(MatSetType(B, MATSEQAIJ));
218     PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen));
219   }
220 
221   for (i = 0; i < A->rmap->n; i++) {
222     PetscInt     nz = 0, *cols = NULL;
223     PetscScalar *vals = NULL;
224 
225     PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals));
226     PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES));
227     PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals));
228   }
229 
230   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
231   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
232   B->rmap->bs = A->rmap->bs;
233 
234   if (reuse == MAT_INPLACE_MATRIX) {
235     PetscCall(MatHeaderReplace(A, &B));
236   } else {
237     *newmat = B;
238   }
239   PetscFunctionReturn(0);
240 }
241 
242 #include <../src/mat/impls/aij/seq/aij.h>
243 
244 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat)
245 {
246   Mat                B;
247   Mat_SeqAIJ        *a  = (Mat_SeqAIJ *)A->data;
248   PetscInt          *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols;
249   const PetscInt    *cols;
250   const PetscScalar *vals;
251 
252   PetscFunctionBegin;
253 
254   if (reuse == MAT_REUSE_MATRIX) {
255     B = *newmat;
256   } else {
257     if (PetscDefined(USE_DEBUG) || !a->ilen) {
258       PetscCall(PetscMalloc1(m, &rowlengths));
259       for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i];
260     }
261     if (PetscDefined(USE_DEBUG) && a->ilen) {
262       PetscBool eq;
263       PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq));
264       PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect");
265       PetscCall(PetscFree(rowlengths));
266       rowlengths = a->ilen;
267     } else if (a->ilen) rowlengths = a->ilen;
268     PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B));
269     PetscCall(MatSetSizes(B, m, n, m, n));
270     PetscCall(MatSetType(B, MATSEQSELL));
271     PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths));
272     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
273   }
274 
275   for (row = 0; row < m; row++) {
276     PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
277     PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES));
278     PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals));
279   }
280   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
281   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
282   B->rmap->bs = A->rmap->bs;
283 
284   if (reuse == MAT_INPLACE_MATRIX) {
285     PetscCall(MatHeaderReplace(A, &B));
286   } else {
287     *newmat = B;
288   }
289   PetscFunctionReturn(0);
290 }
291 
292 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy)
293 {
294   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
295   PetscScalar       *y;
296   const PetscScalar *x;
297   const MatScalar   *aval        = a->val;
298   PetscInt           totalslices = a->totalslices;
299   const PetscInt    *acolidx     = a->colidx;
300   PetscInt           i, j;
301 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
302   __m512d  vec_x, vec_y, vec_vals;
303   __m256i  vec_idx;
304   __mmask8 mask;
305   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
306   __m256i  vec_idx2, vec_idx3, vec_idx4;
307 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
308   __m128i   vec_idx;
309   __m256d   vec_x, vec_y, vec_y2, vec_vals;
310   MatScalar yval;
311   PetscInt  r, rows_left, row, nnz_in_row;
312 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
313   __m128d   vec_x_tmp;
314   __m256d   vec_x, vec_y, vec_y2, vec_vals;
315   MatScalar yval;
316   PetscInt  r, rows_left, row, nnz_in_row;
317 #else
318   PetscScalar sum[8];
319 #endif
320 
321 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
322   #pragma disjoint(*x, *y, *aval)
323 #endif
324 
325   PetscFunctionBegin;
326   PetscCall(VecGetArrayRead(xx, &x));
327   PetscCall(VecGetArray(yy, &y));
328 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
329   for (i = 0; i < totalslices; i++) { /* loop over slices */
330     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
331     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
332 
333     vec_y  = _mm512_setzero_pd();
334     vec_y2 = _mm512_setzero_pd();
335     vec_y3 = _mm512_setzero_pd();
336     vec_y4 = _mm512_setzero_pd();
337 
338     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */
339     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
340     case 3:
341       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
342       acolidx += 8;
343       aval += 8;
344       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
345       acolidx += 8;
346       aval += 8;
347       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
348       acolidx += 8;
349       aval += 8;
350       j += 3;
351       break;
352     case 2:
353       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
354       acolidx += 8;
355       aval += 8;
356       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
357       acolidx += 8;
358       aval += 8;
359       j += 2;
360       break;
361     case 1:
362       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
363       acolidx += 8;
364       aval += 8;
365       j += 1;
366       break;
367     }
368   #pragma novector
369     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
370       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
371       acolidx += 8;
372       aval += 8;
373       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
374       acolidx += 8;
375       aval += 8;
376       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
377       acolidx += 8;
378       aval += 8;
379       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
380       acolidx += 8;
381       aval += 8;
382     }
383 
384     vec_y = _mm512_add_pd(vec_y, vec_y2);
385     vec_y = _mm512_add_pd(vec_y, vec_y3);
386     vec_y = _mm512_add_pd(vec_y, vec_y4);
387     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
388       mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
389       _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y);
390     } else {
391       _mm512_storeu_pd(&y[8 * i], vec_y);
392     }
393   }
394 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
395   for (i = 0; i < totalslices; i++) { /* loop over full slices */
396     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
397     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
398 
399     /* last slice may have padding rows. Don't use vectorization. */
400     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
401       rows_left = A->rmap->n - 8 * i;
402       for (r = 0; r < rows_left; ++r) {
403         yval       = (MatScalar)0;
404         row        = 8 * i + r;
405         nnz_in_row = a->rlen[row];
406         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
407         y[row] = yval;
408       }
409       break;
410     }
411 
412     vec_y  = _mm256_setzero_pd();
413     vec_y2 = _mm256_setzero_pd();
414 
415   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
416   #pragma novector
417   #pragma unroll(2)
418     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
419       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
420       aval += 4;
421       acolidx += 4;
422       AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2);
423       aval += 4;
424       acolidx += 4;
425     }
426 
427     _mm256_storeu_pd(y + i * 8, vec_y);
428     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
429   }
430 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
431   for (i = 0; i < totalslices; i++) { /* loop over full slices */
432     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
433     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
434 
435     vec_y  = _mm256_setzero_pd();
436     vec_y2 = _mm256_setzero_pd();
437 
438     /* last slice may have padding rows. Don't use vectorization. */
439     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
440       rows_left = A->rmap->n - 8 * i;
441       for (r = 0; r < rows_left; ++r) {
442         yval       = (MatScalar)0;
443         row        = 8 * i + r;
444         nnz_in_row = a->rlen[row];
445         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
446         y[row] = yval;
447       }
448       break;
449     }
450 
451   /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
452   #pragma novector
453   #pragma unroll(2)
454     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
455       vec_vals  = _mm256_loadu_pd(aval);
456       vec_x_tmp = _mm_setzero_pd();
457       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
458       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
459       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
460       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
461       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
462       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
463       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
464       aval += 4;
465 
466       vec_vals  = _mm256_loadu_pd(aval);
467       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
468       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
469       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
470       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
471       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
472       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
473       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
474       aval += 4;
475     }
476 
477     _mm256_storeu_pd(y + i * 8, vec_y);
478     _mm256_storeu_pd(y + i * 8 + 4, vec_y2);
479   }
480 #else
481   for (i = 0; i < totalslices; i++) { /* loop over slices */
482     for (j = 0; j < 8; j++) sum[j] = 0.0;
483     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
484       sum[0] += aval[j] * x[acolidx[j]];
485       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
486       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
487       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
488       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
489       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
490       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
491       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
492     }
493     if (i == totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
494       for (j = 0; j < (A->rmap->n & 0x07); j++) y[8 * i + j] = sum[j];
495     } else {
496       for (j = 0; j < 8; j++) y[8 * i + j] = sum[j];
497     }
498   }
499 #endif
500 
501   PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */
502   PetscCall(VecRestoreArrayRead(xx, &x));
503   PetscCall(VecRestoreArray(yy, &y));
504   PetscFunctionReturn(0);
505 }
506 
507 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
508 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz)
509 {
510   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
511   PetscScalar       *y, *z;
512   const PetscScalar *x;
513   const MatScalar   *aval        = a->val;
514   PetscInt           totalslices = a->totalslices;
515   const PetscInt    *acolidx     = a->colidx;
516   PetscInt           i, j;
517 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
518   __m512d  vec_x, vec_y, vec_vals;
519   __m256i  vec_idx;
520   __mmask8 mask;
521   __m512d  vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4;
522   __m256i  vec_idx2, vec_idx3, vec_idx4;
523 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
524   __m128d   vec_x_tmp;
525   __m256d   vec_x, vec_y, vec_y2, vec_vals;
526   MatScalar yval;
527   PetscInt  r, row, nnz_in_row;
528 #else
529   PetscScalar sum[8];
530 #endif
531 
532 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
533   #pragma disjoint(*x, *y, *aval)
534 #endif
535 
536   PetscFunctionBegin;
537   PetscCall(VecGetArrayRead(xx, &x));
538   PetscCall(VecGetArrayPair(yy, zz, &y, &z));
539 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
540   for (i = 0; i < totalslices; i++) { /* loop over slices */
541     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
542     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
543 
544     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
545       mask  = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07)));
546       vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]);
547     } else {
548       vec_y = _mm512_loadu_pd(&y[8 * i]);
549     }
550     vec_y2 = _mm512_setzero_pd();
551     vec_y3 = _mm512_setzero_pd();
552     vec_y4 = _mm512_setzero_pd();
553 
554     j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice columnn */
555     switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) {
556     case 3:
557       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
558       acolidx += 8;
559       aval += 8;
560       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
561       acolidx += 8;
562       aval += 8;
563       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
564       acolidx += 8;
565       aval += 8;
566       j += 3;
567       break;
568     case 2:
569       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
570       acolidx += 8;
571       aval += 8;
572       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
573       acolidx += 8;
574       aval += 8;
575       j += 2;
576       break;
577     case 1:
578       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
579       acolidx += 8;
580       aval += 8;
581       j += 1;
582       break;
583     }
584   #pragma novector
585     for (; j < (a->sliidx[i + 1] >> 3); j += 4) {
586       AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y);
587       acolidx += 8;
588       aval += 8;
589       AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2);
590       acolidx += 8;
591       aval += 8;
592       AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3);
593       acolidx += 8;
594       aval += 8;
595       AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4);
596       acolidx += 8;
597       aval += 8;
598     }
599 
600     vec_y = _mm512_add_pd(vec_y, vec_y2);
601     vec_y = _mm512_add_pd(vec_y, vec_y3);
602     vec_y = _mm512_add_pd(vec_y, vec_y4);
603     if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
604       _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y);
605     } else {
606       _mm512_storeu_pd(&z[8 * i], vec_y);
607     }
608   }
609 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
610   for (i = 0; i < totalslices; i++) { /* loop over full slices */
611     PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
612     PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0);
613 
614     /* last slice may have padding rows. Don't use vectorization. */
615     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
616       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
617         row        = 8 * i + r;
618         yval       = (MatScalar)0.0;
619         nnz_in_row = a->rlen[row];
620         for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]];
621         z[row] = y[row] + yval;
622       }
623       break;
624     }
625 
626     vec_y  = _mm256_loadu_pd(y + 8 * i);
627     vec_y2 = _mm256_loadu_pd(y + 8 * i + 4);
628 
629     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
630     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
631       vec_vals  = _mm256_loadu_pd(aval);
632       vec_x_tmp = _mm_setzero_pd();
633       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
634       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
635       vec_x     = _mm256_setzero_pd();
636       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
637       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
638       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
639       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
640       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y);
641       aval += 4;
642 
643       vec_vals  = _mm256_loadu_pd(aval);
644       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
645       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
646       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0);
647       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
648       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
649       vec_x     = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1);
650       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2);
651       aval += 4;
652     }
653 
654     _mm256_storeu_pd(z + i * 8, vec_y);
655     _mm256_storeu_pd(z + i * 8 + 4, vec_y2);
656   }
657 #else
658   for (i = 0; i < totalslices; i++) { /* loop over slices */
659     for (j = 0; j < 8; j++) sum[j] = 0.0;
660     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
661       sum[0] += aval[j] * x[acolidx[j]];
662       sum[1] += aval[j + 1] * x[acolidx[j + 1]];
663       sum[2] += aval[j + 2] * x[acolidx[j + 2]];
664       sum[3] += aval[j + 3] * x[acolidx[j + 3]];
665       sum[4] += aval[j + 4] * x[acolidx[j + 4]];
666       sum[5] += aval[j + 5] * x[acolidx[j + 5]];
667       sum[6] += aval[j + 6] * x[acolidx[j + 6]];
668       sum[7] += aval[j + 7] * x[acolidx[j + 7]];
669     }
670     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
671       for (j = 0; j < (A->rmap->n & 0x07); j++) z[8 * i + j] = y[8 * i + j] + sum[j];
672     } else {
673       for (j = 0; j < 8; j++) z[8 * i + j] = y[8 * i + j] + sum[j];
674     }
675   }
676 #endif
677 
678   PetscCall(PetscLogFlops(2.0 * a->nz));
679   PetscCall(VecRestoreArrayRead(xx, &x));
680   PetscCall(VecRestoreArrayPair(yy, zz, &y, &z));
681   PetscFunctionReturn(0);
682 }
683 
684 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy)
685 {
686   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
687   PetscScalar       *y;
688   const PetscScalar *x;
689   const MatScalar   *aval    = a->val;
690   const PetscInt    *acolidx = a->colidx;
691   PetscInt           i, j, r, row, nnz_in_row, totalslices = a->totalslices;
692 
693 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
694   #pragma disjoint(*x, *y, *aval)
695 #endif
696 
697   PetscFunctionBegin;
698   if (A->symmetric == PETSC_BOOL3_TRUE) {
699     PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy));
700     PetscFunctionReturn(0);
701   }
702   if (zz != yy) PetscCall(VecCopy(zz, yy));
703   PetscCall(VecGetArrayRead(xx, &x));
704   PetscCall(VecGetArray(yy, &y));
705   for (i = 0; i < a->totalslices; i++) { /* loop over slices */
706     if (i == totalslices - 1 && (A->rmap->n & 0x07)) {
707       for (r = 0; r < (A->rmap->n & 0x07); ++r) {
708         row        = 8 * i + r;
709         nnz_in_row = a->rlen[row];
710         for (j = 0; j < nnz_in_row; ++j) y[acolidx[8 * j + r]] += aval[8 * j + r] * x[row];
711       }
712       break;
713     }
714     for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) {
715       y[acolidx[j]] += aval[j] * x[8 * i];
716       y[acolidx[j + 1]] += aval[j + 1] * x[8 * i + 1];
717       y[acolidx[j + 2]] += aval[j + 2] * x[8 * i + 2];
718       y[acolidx[j + 3]] += aval[j + 3] * x[8 * i + 3];
719       y[acolidx[j + 4]] += aval[j + 4] * x[8 * i + 4];
720       y[acolidx[j + 5]] += aval[j + 5] * x[8 * i + 5];
721       y[acolidx[j + 6]] += aval[j + 6] * x[8 * i + 6];
722       y[acolidx[j + 7]] += aval[j + 7] * x[8 * i + 7];
723     }
724   }
725   PetscCall(PetscLogFlops(2.0 * a->sliidx[a->totalslices]));
726   PetscCall(VecRestoreArrayRead(xx, &x));
727   PetscCall(VecRestoreArray(yy, &y));
728   PetscFunctionReturn(0);
729 }
730 
731 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy)
732 {
733   PetscFunctionBegin;
734   if (A->symmetric == PETSC_BOOL3_TRUE) {
735     PetscCall(MatMult_SeqSELL(A, xx, yy));
736   } else {
737     PetscCall(VecSet(yy, 0.0));
738     PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy));
739   }
740   PetscFunctionReturn(0);
741 }
742 
743 /*
744      Checks for missing diagonals
745 */
746 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d)
747 {
748   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
749   PetscInt    *diag, i;
750 
751   PetscFunctionBegin;
752   *missing = PETSC_FALSE;
753   if (A->rmap->n > 0 && !(a->colidx)) {
754     *missing = PETSC_TRUE;
755     if (d) *d = 0;
756     PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n"));
757   } else {
758     diag = a->diag;
759     for (i = 0; i < A->rmap->n; i++) {
760       if (diag[i] == -1) {
761         *missing = PETSC_TRUE;
762         if (d) *d = i;
763         PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i));
764         break;
765       }
766     }
767   }
768   PetscFunctionReturn(0);
769 }
770 
771 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
772 {
773   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
774   PetscInt     i, j, m = A->rmap->n, shift;
775 
776   PetscFunctionBegin;
777   if (!a->diag) {
778     PetscCall(PetscMalloc1(m, &a->diag));
779     a->free_diag = PETSC_TRUE;
780   }
781   for (i = 0; i < m; i++) {                      /* loop over rows */
782     shift      = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
783     a->diag[i] = -1;
784     for (j = 0; j < a->rlen[i]; j++) {
785       if (a->colidx[shift + j * 8] == i) {
786         a->diag[i] = shift + j * 8;
787         break;
788       }
789     }
790   }
791   PetscFunctionReturn(0);
792 }
793 
794 /*
795   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
796 */
797 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift)
798 {
799   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
800   PetscInt     i, *diag, m = A->rmap->n;
801   MatScalar   *val = a->val;
802   PetscScalar *idiag, *mdiag;
803 
804   PetscFunctionBegin;
805   if (a->idiagvalid) PetscFunctionReturn(0);
806   PetscCall(MatMarkDiagonal_SeqSELL(A));
807   diag = a->diag;
808   if (!a->idiag) {
809     PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work));
810     val = a->val;
811   }
812   mdiag = a->mdiag;
813   idiag = a->idiag;
814 
815   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
816     for (i = 0; i < m; i++) {
817       mdiag[i] = val[diag[i]];
818       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
819         PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i);
820         PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i));
821         A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
822         A->factorerror_zeropivot_value = 0.0;
823         A->factorerror_zeropivot_row   = i;
824       }
825       idiag[i] = 1.0 / val[diag[i]];
826     }
827     PetscCall(PetscLogFlops(m));
828   } else {
829     for (i = 0; i < m; i++) {
830       mdiag[i] = val[diag[i]];
831       idiag[i] = omega / (fshift + val[diag[i]]);
832     }
833     PetscCall(PetscLogFlops(2.0 * m));
834   }
835   a->idiagvalid = PETSC_TRUE;
836   PetscFunctionReturn(0);
837 }
838 
839 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
840 {
841   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
842 
843   PetscFunctionBegin;
844   PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices]));
845   PetscCall(MatSeqSELLInvalidateDiagonal(A));
846   PetscFunctionReturn(0);
847 }
848 
849 PetscErrorCode MatDestroy_SeqSELL(Mat A)
850 {
851   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
852 
853   PetscFunctionBegin;
854 #if defined(PETSC_USE_LOG)
855   PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz);
856 #endif
857   PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx));
858   PetscCall(ISDestroy(&a->row));
859   PetscCall(ISDestroy(&a->col));
860   PetscCall(PetscFree(a->diag));
861   PetscCall(PetscFree(a->rlen));
862   PetscCall(PetscFree(a->sliidx));
863   PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work));
864   PetscCall(PetscFree(a->solve_work));
865   PetscCall(ISDestroy(&a->icol));
866   PetscCall(PetscFree(a->saved_values));
867   PetscCall(PetscFree2(a->getrowcols, a->getrowvals));
868 
869   PetscCall(PetscFree(A->data));
870 
871   PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL));
872   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL));
873   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL));
874   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL));
875   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL));
876   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL));
877   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL));
878   PetscFunctionReturn(0);
879 }
880 
881 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg)
882 {
883   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
884 
885   PetscFunctionBegin;
886   switch (op) {
887   case MAT_ROW_ORIENTED:
888     a->roworiented = flg;
889     break;
890   case MAT_KEEP_NONZERO_PATTERN:
891     a->keepnonzeropattern = flg;
892     break;
893   case MAT_NEW_NONZERO_LOCATIONS:
894     a->nonew = (flg ? 0 : 1);
895     break;
896   case MAT_NEW_NONZERO_LOCATION_ERR:
897     a->nonew = (flg ? -1 : 0);
898     break;
899   case MAT_NEW_NONZERO_ALLOCATION_ERR:
900     a->nonew = (flg ? -2 : 0);
901     break;
902   case MAT_UNUSED_NONZERO_LOCATION_ERR:
903     a->nounused = (flg ? -1 : 0);
904     break;
905   case MAT_FORCE_DIAGONAL_ENTRIES:
906   case MAT_IGNORE_OFF_PROC_ENTRIES:
907   case MAT_USE_HASH_TABLE:
908   case MAT_SORTED_FULL:
909     PetscCall(PetscInfo(A, "Option %s ignored\n", MatOptions[op]));
910     break;
911   case MAT_SPD:
912   case MAT_SYMMETRIC:
913   case MAT_STRUCTURALLY_SYMMETRIC:
914   case MAT_HERMITIAN:
915   case MAT_SYMMETRY_ETERNAL:
916   case MAT_STRUCTURAL_SYMMETRY_ETERNAL:
917   case MAT_SPD_ETERNAL:
918     /* These options are handled directly by MatSetOption() */
919     break;
920   default:
921     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unknown option %d", op);
922   }
923   PetscFunctionReturn(0);
924 }
925 
926 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v)
927 {
928   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
929   PetscInt     i, j, n, shift;
930   PetscScalar *x, zero = 0.0;
931 
932   PetscFunctionBegin;
933   PetscCall(VecGetLocalSize(v, &n));
934   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
935 
936   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
937     PetscInt *diag = a->diag;
938     PetscCall(VecGetArray(v, &x));
939     for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]];
940     PetscCall(VecRestoreArray(v, &x));
941     PetscFunctionReturn(0);
942   }
943 
944   PetscCall(VecSet(v, zero));
945   PetscCall(VecGetArray(v, &x));
946   for (i = 0; i < n; i++) {                 /* loop over rows */
947     shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
948     x[i]  = 0;
949     for (j = 0; j < a->rlen[i]; j++) {
950       if (a->colidx[shift + j * 8] == i) {
951         x[i] = a->val[shift + j * 8];
952         break;
953       }
954     }
955   }
956   PetscCall(VecRestoreArray(v, &x));
957   PetscFunctionReturn(0);
958 }
959 
960 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr)
961 {
962   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
963   const PetscScalar *l, *r;
964   PetscInt           i, j, m, n, row;
965 
966   PetscFunctionBegin;
967   if (ll) {
968     /* The local size is used so that VecMPI can be passed to this routine
969        by MatDiagonalScale_MPISELL */
970     PetscCall(VecGetLocalSize(ll, &m));
971     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
972     PetscCall(VecGetArrayRead(ll, &l));
973     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
974       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
975         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
976           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8 * i + row];
977         }
978       } else {
979         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) a->val[j] *= l[8 * i + row];
980       }
981     }
982     PetscCall(VecRestoreArrayRead(ll, &l));
983     PetscCall(PetscLogFlops(a->nz));
984   }
985   if (rr) {
986     PetscCall(VecGetLocalSize(rr, &n));
987     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
988     PetscCall(VecGetArrayRead(rr, &r));
989     for (i = 0; i < a->totalslices; i++) {                  /* loop over slices */
990       if (i == a->totalslices - 1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
991         for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
992           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
993         }
994       } else {
995         for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]];
996       }
997     }
998     PetscCall(VecRestoreArrayRead(rr, &r));
999     PetscCall(PetscLogFlops(a->nz));
1000   }
1001   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1002   PetscFunctionReturn(0);
1003 }
1004 
1005 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[])
1006 {
1007   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1008   PetscInt    *cp, i, k, low, high, t, row, col, l;
1009   PetscInt     shift;
1010   MatScalar   *vp;
1011 
1012   PetscFunctionBegin;
1013   for (k = 0; k < m; k++) { /* loop over requested rows */
1014     row = im[k];
1015     if (row < 0) continue;
1016     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1017     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1018     cp    = a->colidx + shift;                  /* pointer to the row */
1019     vp    = a->val + shift;                     /* pointer to the row */
1020     for (l = 0; l < n; l++) {                   /* loop over requested columns */
1021       col = in[l];
1022       if (col < 0) continue;
1023       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1024       high = a->rlen[row];
1025       low  = 0; /* assume unsorted */
1026       while (high - low > 5) {
1027         t = (low + high) / 2;
1028         if (*(cp + t * 8) > col) high = t;
1029         else low = t;
1030       }
1031       for (i = low; i < high; i++) {
1032         if (*(cp + 8 * i) > col) break;
1033         if (*(cp + 8 * i) == col) {
1034           *v++ = *(vp + 8 * i);
1035           goto finished;
1036         }
1037       }
1038       *v++ = 0.0;
1039     finished:;
1040     }
1041   }
1042   PetscFunctionReturn(0);
1043 }
1044 
1045 PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer)
1046 {
1047   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1048   PetscInt          i, j, m = A->rmap->n, shift;
1049   const char       *name;
1050   PetscViewerFormat format;
1051 
1052   PetscFunctionBegin;
1053   PetscCall(PetscViewerGetFormat(viewer, &format));
1054   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1055     PetscInt nofinalvalue = 0;
1056     /*
1057     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1058       nofinalvalue = 1;
1059     }
1060     */
1061     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1062     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n));
1063     PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz));
1064 #if defined(PETSC_USE_COMPLEX)
1065     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue));
1066 #else
1067     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue));
1068 #endif
1069     PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n"));
1070 
1071     for (i = 0; i < m; i++) {
1072       shift = a->sliidx[i >> 3] + (i & 0x07);
1073       for (j = 0; j < a->rlen[i]; j++) {
1074 #if defined(PETSC_USE_COMPLEX)
1075         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1076 #else
1077         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n", i + 1, a->colidx[shift + 8 * j] + 1, (double)a->val[shift + 8 * j]));
1078 #endif
1079       }
1080     }
1081     /*
1082     if (nofinalvalue) {
1083 #if defined(PETSC_USE_COMPLEX)
1084       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1085 #else
1086       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1087 #endif
1088     }
1089     */
1090     PetscCall(PetscObjectGetName((PetscObject)A, &name));
1091     PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name));
1092     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1093   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1094     PetscFunctionReturn(0);
1095   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1096     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1097     for (i = 0; i < m; i++) {
1098       PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1099       shift = a->sliidx[i >> 3] + (i & 0x07);
1100       for (j = 0; j < a->rlen[i]; j++) {
1101 #if defined(PETSC_USE_COMPLEX)
1102         if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1103           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1104         } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0 && PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1105           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1106         } else if (PetscRealPart(a->val[shift + 8 * j]) != 0.0) {
1107           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1108         }
1109 #else
1110         if (a->val[shift + 8 * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1111 #endif
1112       }
1113       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1114     }
1115     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1116   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1117     PetscInt    cnt = 0, jcnt;
1118     PetscScalar value;
1119 #if defined(PETSC_USE_COMPLEX)
1120     PetscBool realonly = PETSC_TRUE;
1121     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1122       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1123         realonly = PETSC_FALSE;
1124         break;
1125       }
1126     }
1127 #endif
1128 
1129     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1130     for (i = 0; i < m; i++) {
1131       jcnt  = 0;
1132       shift = a->sliidx[i >> 3] + (i & 0x07);
1133       for (j = 0; j < A->cmap->n; j++) {
1134         if (jcnt < a->rlen[i] && j == a->colidx[shift + 8 * j]) {
1135           value = a->val[cnt++];
1136           jcnt++;
1137         } else {
1138           value = 0.0;
1139         }
1140 #if defined(PETSC_USE_COMPLEX)
1141         if (realonly) {
1142           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value)));
1143         } else {
1144           PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value)));
1145         }
1146 #else
1147         PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value));
1148 #endif
1149       }
1150       PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1151     }
1152     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1153   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1154     PetscInt fshift = 1;
1155     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1156 #if defined(PETSC_USE_COMPLEX)
1157     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n"));
1158 #else
1159     PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n"));
1160 #endif
1161     PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1162     for (i = 0; i < m; i++) {
1163       shift = a->sliidx[i >> 3] + (i & 0x07);
1164       for (j = 0; j < a->rlen[i]; j++) {
1165 #if defined(PETSC_USE_COMPLEX)
1166         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1167 #else
1168         PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + 8 * j] + fshift, (double)a->val[shift + 8 * j]));
1169 #endif
1170       }
1171     }
1172     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1173   } else if (format == PETSC_VIEWER_NATIVE) {
1174     for (i = 0; i < a->totalslices; i++) { /* loop over slices */
1175       PetscInt row;
1176       PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1]));
1177       for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) & 0x07)) {
1178 #if defined(PETSC_USE_COMPLEX)
1179         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1180           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1181         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1182           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j])));
1183         } else {
1184           PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)PetscRealPart(a->val[j])));
1185         }
1186 #else
1187         PetscCall(PetscViewerASCIIPrintf(viewer, "  %" PetscInt_FMT " %" PetscInt_FMT " %g\n", 8 * i + row, a->colidx[j], (double)a->val[j]));
1188 #endif
1189       }
1190     }
1191   } else {
1192     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE));
1193     if (A->factortype) {
1194       for (i = 0; i < m; i++) {
1195         shift = a->sliidx[i >> 3] + (i & 0x07);
1196         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1197         /* L part */
1198         for (j = shift; j < a->diag[i]; j += 8) {
1199 #if defined(PETSC_USE_COMPLEX)
1200           if (PetscImaginaryPart(a->val[shift + 8 * j]) > 0.0) {
1201             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1202           } else if (PetscImaginaryPart(a->val[shift + 8 * j]) < 0.0) {
1203             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1204           } else {
1205             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1206           }
1207 #else
1208           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1209 #endif
1210         }
1211         /* diagonal */
1212         j = a->diag[i];
1213 #if defined(PETSC_USE_COMPLEX)
1214         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1215           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j])));
1216         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1217           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j]))));
1218         } else {
1219           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j])));
1220         }
1221 #else
1222         PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1.0 / a->val[j])));
1223 #endif
1224 
1225         /* U part */
1226         for (j = a->diag[i] + 1; j < shift + 8 * a->rlen[i]; j += 8) {
1227 #if defined(PETSC_USE_COMPLEX)
1228           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1229             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j])));
1230           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1231             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j]))));
1232           } else {
1233             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j])));
1234           }
1235 #else
1236           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j]));
1237 #endif
1238         }
1239         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1240       }
1241     } else {
1242       for (i = 0; i < m; i++) {
1243         shift = a->sliidx[i >> 3] + (i & 0x07);
1244         PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i));
1245         for (j = 0; j < a->rlen[i]; j++) {
1246 #if defined(PETSC_USE_COMPLEX)
1247           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1248             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)PetscImaginaryPart(a->val[shift + 8 * j])));
1249           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1250             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j]), (double)-PetscImaginaryPart(a->val[shift + 8 * j])));
1251           } else {
1252             PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)PetscRealPart(a->val[shift + 8 * j])));
1253           }
1254 #else
1255           PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + 8 * j], (double)a->val[shift + 8 * j]));
1256 #endif
1257         }
1258         PetscCall(PetscViewerASCIIPrintf(viewer, "\n"));
1259       }
1260     }
1261     PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE));
1262   }
1263   PetscCall(PetscViewerFlush(viewer));
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 #include <petscdraw.h>
1268 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa)
1269 {
1270   Mat               A = (Mat)Aa;
1271   Mat_SeqSELL      *a = (Mat_SeqSELL *)A->data;
1272   PetscInt          i, j, m = A->rmap->n, shift;
1273   int               color;
1274   PetscReal         xl, yl, xr, yr, x_l, x_r, y_l, y_r;
1275   PetscViewer       viewer;
1276   PetscViewerFormat format;
1277 
1278   PetscFunctionBegin;
1279   PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer));
1280   PetscCall(PetscViewerGetFormat(viewer, &format));
1281   PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr));
1282 
1283   /* loop over matrix elements drawing boxes */
1284 
1285   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1286     PetscDrawCollectiveBegin(draw);
1287     /* Blue for negative, Cyan for zero and  Red for positive */
1288     color = PETSC_DRAW_BLUE;
1289     for (i = 0; i < m; i++) {
1290       shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1291       y_l   = m - i - 1.0;
1292       y_r   = y_l + 1.0;
1293       for (j = 0; j < a->rlen[i]; j++) {
1294         x_l = a->colidx[shift + j * 8];
1295         x_r = x_l + 1.0;
1296         if (PetscRealPart(a->val[shift + 8 * j]) >= 0.) continue;
1297         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1298       }
1299     }
1300     color = PETSC_DRAW_CYAN;
1301     for (i = 0; i < m; i++) {
1302       shift = a->sliidx[i >> 3] + (i & 0x07);
1303       y_l   = m - i - 1.0;
1304       y_r   = y_l + 1.0;
1305       for (j = 0; j < a->rlen[i]; j++) {
1306         x_l = a->colidx[shift + j * 8];
1307         x_r = x_l + 1.0;
1308         if (a->val[shift + 8 * j] != 0.) continue;
1309         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1310       }
1311     }
1312     color = PETSC_DRAW_RED;
1313     for (i = 0; i < m; i++) {
1314       shift = a->sliidx[i >> 3] + (i & 0x07);
1315       y_l   = m - i - 1.0;
1316       y_r   = y_l + 1.0;
1317       for (j = 0; j < a->rlen[i]; j++) {
1318         x_l = a->colidx[shift + j * 8];
1319         x_r = x_l + 1.0;
1320         if (PetscRealPart(a->val[shift + 8 * j]) <= 0.) continue;
1321         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1322       }
1323     }
1324     PetscDrawCollectiveEnd(draw);
1325   } else {
1326     /* use contour shading to indicate magnitude of values */
1327     /* first determine max of all nonzero values */
1328     PetscReal minv = 0.0, maxv = 0.0;
1329     PetscInt  count = 0;
1330     PetscDraw popup;
1331     for (i = 0; i < a->sliidx[a->totalslices]; i++) {
1332       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1333     }
1334     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1335     PetscCall(PetscDrawGetPopup(draw, &popup));
1336     PetscCall(PetscDrawScalePopup(popup, minv, maxv));
1337 
1338     PetscDrawCollectiveBegin(draw);
1339     for (i = 0; i < m; i++) {
1340       shift = a->sliidx[i >> 3] + (i & 0x07);
1341       y_l   = m - i - 1.0;
1342       y_r   = y_l + 1.0;
1343       for (j = 0; j < a->rlen[i]; j++) {
1344         x_l   = a->colidx[shift + j * 8];
1345         x_r   = x_l + 1.0;
1346         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv);
1347         PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color));
1348         count++;
1349       }
1350     }
1351     PetscDrawCollectiveEnd(draw);
1352   }
1353   PetscFunctionReturn(0);
1354 }
1355 
1356 #include <petscdraw.h>
1357 PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer)
1358 {
1359   PetscDraw draw;
1360   PetscReal xr, yr, xl, yl, h, w;
1361   PetscBool isnull;
1362 
1363   PetscFunctionBegin;
1364   PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw));
1365   PetscCall(PetscDrawIsNull(draw, &isnull));
1366   if (isnull) PetscFunctionReturn(0);
1367 
1368   xr = A->cmap->n;
1369   yr = A->rmap->n;
1370   h  = yr / 10.0;
1371   w  = xr / 10.0;
1372   xr += w;
1373   yr += h;
1374   xl = -w;
1375   yl = -h;
1376   PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr));
1377   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer));
1378   PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A));
1379   PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL));
1380   PetscCall(PetscDrawSave(draw));
1381   PetscFunctionReturn(0);
1382 }
1383 
1384 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer)
1385 {
1386   PetscBool iascii, isbinary, isdraw;
1387 
1388   PetscFunctionBegin;
1389   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1390   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary));
1391   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw));
1392   if (iascii) {
1393     PetscCall(MatView_SeqSELL_ASCII(A, viewer));
1394   } else if (isbinary) {
1395     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1396   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer));
1397   PetscFunctionReturn(0);
1398 }
1399 
1400 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode)
1401 {
1402   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1403   PetscInt     i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k;
1404   MatScalar   *vp;
1405 
1406   PetscFunctionBegin;
1407   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1408   /* To do: compress out the unused elements */
1409   PetscCall(MatMarkDiagonal_SeqSELL(A));
1410   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz));
1411   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs));
1412   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax));
1413   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1414   for (i = 0; i < a->totalslices; ++i) {
1415     shift = a->sliidx[i];                                      /* starting index of the slice */
1416     cp    = a->colidx + shift;                                 /* pointer to the column indices of the slice */
1417     vp    = a->val + shift;                                    /* pointer to the nonzero values of the slice */
1418     for (row_in_slice = 0; row_in_slice < 8; ++row_in_slice) { /* loop over rows in the slice */
1419       row  = 8 * i + row_in_slice;
1420       nrow = a->rlen[row]; /* number of nonzeros in row */
1421       /*
1422         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1423         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1424       */
1425       lastcol = 0;
1426       if (nrow > 0) {                                /* nonempty row */
1427         lastcol = cp[8 * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */
1428       } else if (!row_in_slice) {                    /* first row of the currect slice is empty */
1429         for (j = 1; j < 8; j++) {
1430           if (a->rlen[8 * i + j]) {
1431             lastcol = cp[j];
1432             break;
1433           }
1434         }
1435       } else {
1436         if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */
1437       }
1438 
1439       for (k = nrow; k < (a->sliidx[i + 1] - shift) / 8; ++k) {
1440         cp[8 * k + row_in_slice] = lastcol;
1441         vp[8 * k + row_in_slice] = (MatScalar)0;
1442       }
1443     }
1444   }
1445 
1446   A->info.mallocs += a->reallocs;
1447   a->reallocs = 0;
1448 
1449   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1450   PetscFunctionReturn(0);
1451 }
1452 
1453 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info)
1454 {
1455   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1456 
1457   PetscFunctionBegin;
1458   info->block_size   = 1.0;
1459   info->nz_allocated = a->maxallocmat;
1460   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1461   info->nz_unneeded  = (a->maxallocmat - a->sliidx[a->totalslices]);
1462   info->assemblies   = A->num_ass;
1463   info->mallocs      = A->info.mallocs;
1464   info->memory       = 0; /* REVIEW ME */
1465   if (A->factortype) {
1466     info->fill_ratio_given  = A->info.fill_ratio_given;
1467     info->fill_ratio_needed = A->info.fill_ratio_needed;
1468     info->factor_mallocs    = A->info.factor_mallocs;
1469   } else {
1470     info->fill_ratio_given  = 0;
1471     info->fill_ratio_needed = 0;
1472     info->factor_mallocs    = 0;
1473   }
1474   PetscFunctionReturn(0);
1475 }
1476 
1477 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is)
1478 {
1479   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1480   PetscInt     shift, i, k, l, low, high, t, ii, row, col, nrow;
1481   PetscInt    *cp, nonew = a->nonew, lastcol = -1;
1482   MatScalar   *vp, value;
1483 
1484   PetscFunctionBegin;
1485   for (k = 0; k < m; k++) { /* loop over added rows */
1486     row = im[k];
1487     if (row < 0) continue;
1488     PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1);
1489     shift = a->sliidx[row >> 3] + (row & 0x07); /* starting index of the row */
1490     cp    = a->colidx + shift;                  /* pointer to the row */
1491     vp    = a->val + shift;                     /* pointer to the row */
1492     nrow  = a->rlen[row];
1493     low   = 0;
1494     high  = nrow;
1495 
1496     for (l = 0; l < n; l++) { /* loop over added columns */
1497       col = in[l];
1498       if (col < 0) continue;
1499       PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1);
1500       if (a->roworiented) {
1501         value = v[l + k * n];
1502       } else {
1503         value = v[k + l * m];
1504       }
1505       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1506 
1507       /* search in this row for the specified column, i indicates the column to be set */
1508       if (col <= lastcol) low = 0;
1509       else high = nrow;
1510       lastcol = col;
1511       while (high - low > 5) {
1512         t = (low + high) / 2;
1513         if (*(cp + t * 8) > col) high = t;
1514         else low = t;
1515       }
1516       for (i = low; i < high; i++) {
1517         if (*(cp + i * 8) > col) break;
1518         if (*(cp + i * 8) == col) {
1519           if (is == ADD_VALUES) *(vp + i * 8) += value;
1520           else *(vp + i * 8) = value;
1521           low = i + 1;
1522           goto noinsert;
1523         }
1524       }
1525       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1526       if (nonew == 1) goto noinsert;
1527       PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1528       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1529       MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, row / 8, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar);
1530       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1531       for (ii = nrow - 1; ii >= i; ii--) {
1532         *(cp + (ii + 1) * 8) = *(cp + ii * 8);
1533         *(vp + (ii + 1) * 8) = *(vp + ii * 8);
1534       }
1535       a->rlen[row]++;
1536       *(cp + i * 8) = col;
1537       *(vp + i * 8) = value;
1538       a->nz++;
1539       A->nonzerostate++;
1540       low = i + 1;
1541       high++;
1542       nrow++;
1543     noinsert:;
1544     }
1545     a->rlen[row] = nrow;
1546   }
1547   PetscFunctionReturn(0);
1548 }
1549 
1550 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str)
1551 {
1552   PetscFunctionBegin;
1553   /* If the two matrices have the same copy implementation, use fast copy. */
1554   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1555     Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1556     Mat_SeqSELL *b = (Mat_SeqSELL *)B->data;
1557 
1558     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different");
1559     PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices]));
1560   } else {
1561     PetscCall(MatCopy_Basic(A, B, str));
1562   }
1563   PetscFunctionReturn(0);
1564 }
1565 
1566 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1567 {
1568   PetscFunctionBegin;
1569   PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL));
1570   PetscFunctionReturn(0);
1571 }
1572 
1573 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[])
1574 {
1575   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1576 
1577   PetscFunctionBegin;
1578   *array = a->val;
1579   PetscFunctionReturn(0);
1580 }
1581 
1582 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[])
1583 {
1584   PetscFunctionBegin;
1585   PetscFunctionReturn(0);
1586 }
1587 
1588 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1589 {
1590   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1591   PetscInt     i;
1592   MatScalar   *aval = a->val;
1593 
1594   PetscFunctionBegin;
1595   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscRealPart(aval[i]);
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1600 {
1601   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
1602   PetscInt     i;
1603   MatScalar   *aval = a->val;
1604 
1605   PetscFunctionBegin;
1606   for (i = 0; i < a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1607   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1608   PetscFunctionReturn(0);
1609 }
1610 
1611 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha)
1612 {
1613   Mat_SeqSELL *a      = (Mat_SeqSELL *)inA->data;
1614   MatScalar   *aval   = a->val;
1615   PetscScalar  oalpha = alpha;
1616   PetscBLASInt one    = 1, size;
1617 
1618   PetscFunctionBegin;
1619   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size));
1620   PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one));
1621   PetscCall(PetscLogFlops(a->nz));
1622   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1623   PetscFunctionReturn(0);
1624 }
1625 
1626 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a)
1627 {
1628   Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data;
1629 
1630   PetscFunctionBegin;
1631   if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL));
1632   PetscCall(MatShift_Basic(Y, a));
1633   PetscFunctionReturn(0);
1634 }
1635 
1636 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
1637 {
1638   Mat_SeqSELL       *a = (Mat_SeqSELL *)A->data;
1639   PetscScalar       *x, sum, *t;
1640   const MatScalar   *idiag = NULL, *mdiag;
1641   const PetscScalar *b, *xb;
1642   PetscInt           n, m = A->rmap->n, i, j, shift;
1643   const PetscInt    *diag;
1644 
1645   PetscFunctionBegin;
1646   its = its * lits;
1647 
1648   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1649   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift));
1650   a->fshift = fshift;
1651   a->omega  = omega;
1652 
1653   diag  = a->diag;
1654   t     = a->ssor_work;
1655   idiag = a->idiag;
1656   mdiag = a->mdiag;
1657 
1658   PetscCall(VecGetArray(xx, &x));
1659   PetscCall(VecGetArrayRead(bb, &b));
1660   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1661   PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented");
1662   PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented");
1663   PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
1664 
1665   if (flag & SOR_ZERO_INITIAL_GUESS) {
1666     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1667       for (i = 0; i < m; i++) {
1668         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1669         sum   = b[i];
1670         n     = (diag[i] - shift) / 8;
1671         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1672         t[i] = sum;
1673         x[i] = sum * idiag[i];
1674       }
1675       xb = t;
1676       PetscCall(PetscLogFlops(a->nz));
1677     } else xb = b;
1678     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1679       for (i = m - 1; i >= 0; i--) {
1680         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1681         sum   = xb[i];
1682         n     = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1683         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1684         if (xb == b) {
1685           x[i] = sum * idiag[i];
1686         } else {
1687           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1688         }
1689       }
1690       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1691     }
1692     its--;
1693   }
1694   while (its--) {
1695     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1696       for (i = 0; i < m; i++) {
1697         /* lower */
1698         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1699         sum   = b[i];
1700         n     = (diag[i] - shift) / 8;
1701         for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1702         t[i] = sum; /* save application of the lower-triangular part */
1703         /* upper */
1704         n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1705         for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1706         x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1707       }
1708       xb = t;
1709       PetscCall(PetscLogFlops(2.0 * a->nz));
1710     } else xb = b;
1711     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1712       for (i = m - 1; i >= 0; i--) {
1713         shift = a->sliidx[i >> 3] + (i & 0x07); /* starting index of the row i */
1714         sum   = xb[i];
1715         if (xb == b) {
1716           /* whole matrix (no checkpointing available) */
1717           n = a->rlen[i];
1718           for (j = 0; j < n; j++) sum -= a->val[shift + j * 8] * x[a->colidx[shift + j * 8]];
1719           x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i];
1720         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1721           n = a->rlen[i] - (diag[i] - shift) / 8 - 1;
1722           for (j = 1; j <= n; j++) sum -= a->val[diag[i] + j * 8] * x[a->colidx[diag[i] + j * 8]];
1723           x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */
1724         }
1725       }
1726       if (xb == b) {
1727         PetscCall(PetscLogFlops(2.0 * a->nz));
1728       } else {
1729         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1730       }
1731     }
1732   }
1733   PetscCall(VecRestoreArray(xx, &x));
1734   PetscCall(VecRestoreArrayRead(bb, &b));
1735   PetscFunctionReturn(0);
1736 }
1737 
1738 /* -------------------------------------------------------------------*/
1739 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1740                                        MatGetRow_SeqSELL,
1741                                        MatRestoreRow_SeqSELL,
1742                                        MatMult_SeqSELL,
1743                                        /* 4*/ MatMultAdd_SeqSELL,
1744                                        MatMultTranspose_SeqSELL,
1745                                        MatMultTransposeAdd_SeqSELL,
1746                                        NULL,
1747                                        NULL,
1748                                        NULL,
1749                                        /* 10*/ NULL,
1750                                        NULL,
1751                                        NULL,
1752                                        MatSOR_SeqSELL,
1753                                        NULL,
1754                                        /* 15*/ MatGetInfo_SeqSELL,
1755                                        MatEqual_SeqSELL,
1756                                        MatGetDiagonal_SeqSELL,
1757                                        MatDiagonalScale_SeqSELL,
1758                                        NULL,
1759                                        /* 20*/ NULL,
1760                                        MatAssemblyEnd_SeqSELL,
1761                                        MatSetOption_SeqSELL,
1762                                        MatZeroEntries_SeqSELL,
1763                                        /* 24*/ NULL,
1764                                        NULL,
1765                                        NULL,
1766                                        NULL,
1767                                        NULL,
1768                                        /* 29*/ MatSetUp_SeqSELL,
1769                                        NULL,
1770                                        NULL,
1771                                        NULL,
1772                                        NULL,
1773                                        /* 34*/ MatDuplicate_SeqSELL,
1774                                        NULL,
1775                                        NULL,
1776                                        NULL,
1777                                        NULL,
1778                                        /* 39*/ NULL,
1779                                        NULL,
1780                                        NULL,
1781                                        MatGetValues_SeqSELL,
1782                                        MatCopy_SeqSELL,
1783                                        /* 44*/ NULL,
1784                                        MatScale_SeqSELL,
1785                                        MatShift_SeqSELL,
1786                                        NULL,
1787                                        NULL,
1788                                        /* 49*/ NULL,
1789                                        NULL,
1790                                        NULL,
1791                                        NULL,
1792                                        NULL,
1793                                        /* 54*/ MatFDColoringCreate_SeqXAIJ,
1794                                        NULL,
1795                                        NULL,
1796                                        NULL,
1797                                        NULL,
1798                                        /* 59*/ NULL,
1799                                        MatDestroy_SeqSELL,
1800                                        MatView_SeqSELL,
1801                                        NULL,
1802                                        NULL,
1803                                        /* 64*/ NULL,
1804                                        NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        /* 69*/ NULL,
1809                                        NULL,
1810                                        NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        /* 74*/ NULL,
1814                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        /* 79*/ NULL,
1819                                        NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        /* 84*/ NULL,
1824                                        NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        /* 89*/ NULL,
1829                                        NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        /* 94*/ NULL,
1834                                        NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        /* 99*/ NULL,
1839                                        NULL,
1840                                        NULL,
1841                                        MatConjugate_SeqSELL,
1842                                        NULL,
1843                                        /*104*/ NULL,
1844                                        NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        /*109*/ NULL,
1849                                        NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        MatMissingDiagonal_SeqSELL,
1853                                        /*114*/ NULL,
1854                                        NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        /*119*/ NULL,
1859                                        NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        /*124*/ NULL,
1864                                        NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        /*129*/ NULL,
1869                                        NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL,
1873                                        /*134*/ NULL,
1874                                        NULL,
1875                                        NULL,
1876                                        NULL,
1877                                        NULL,
1878                                        /*139*/ NULL,
1879                                        NULL,
1880                                        NULL,
1881                                        MatFDColoringSetUp_SeqXAIJ,
1882                                        NULL,
1883                                        /*144*/ NULL,
1884                                        NULL,
1885                                        NULL,
1886                                        NULL,
1887                                        NULL,
1888                                        NULL,
1889                                        /*150*/ NULL,
1890                                        NULL};
1891 
1892 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1893 {
1894   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1895 
1896   PetscFunctionBegin;
1897   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1898 
1899   /* allocate space for values if not already there */
1900   if (!a->saved_values) { PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); }
1901 
1902   /* copy values over */
1903   PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices]));
1904   PetscFunctionReturn(0);
1905 }
1906 
1907 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1908 {
1909   Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data;
1910 
1911   PetscFunctionBegin;
1912   PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1913   PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first");
1914   PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices]));
1915   PetscFunctionReturn(0);
1916 }
1917 
1918 /*@C
1919  MatSeqSELLRestoreArray - returns access to the array where the data for a `MATSEQSELL` matrix is stored obtained by `MatSeqSELLGetArray()`
1920 
1921  Not Collective
1922 
1923  Input Parameters:
1924  .  mat - a `MATSEQSELL` matrix
1925  .  array - pointer to the data
1926 
1927  Level: intermediate
1928 
1929  .seealso: `MATSEQSELL`, `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1930  @*/
1931 PetscErrorCode MatSeqSELLRestoreArray(Mat A, PetscScalar **array)
1932 {
1933   PetscFunctionBegin;
1934   PetscUseMethod(A, "MatSeqSELLRestoreArray_C", (Mat, PetscScalar **), (A, array));
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1939 {
1940   Mat_SeqSELL *b;
1941   PetscMPIInt  size;
1942 
1943   PetscFunctionBegin;
1944   PetscCall(PetscCitationsRegister(citation, &cited));
1945   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
1946   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1");
1947 
1948   PetscCall(PetscNew(&b));
1949 
1950   B->data = (void *)b;
1951 
1952   PetscCall(PetscMemcpy(B->ops, &MatOps_Values, sizeof(struct _MatOps)));
1953 
1954   b->row                = NULL;
1955   b->col                = NULL;
1956   b->icol               = NULL;
1957   b->reallocs           = 0;
1958   b->ignorezeroentries  = PETSC_FALSE;
1959   b->roworiented        = PETSC_TRUE;
1960   b->nonew              = 0;
1961   b->diag               = NULL;
1962   b->solve_work         = NULL;
1963   B->spptr              = NULL;
1964   b->saved_values       = NULL;
1965   b->idiag              = NULL;
1966   b->mdiag              = NULL;
1967   b->ssor_work          = NULL;
1968   b->omega              = 1.0;
1969   b->fshift             = 0.0;
1970   b->idiagvalid         = PETSC_FALSE;
1971   b->keepnonzeropattern = PETSC_FALSE;
1972 
1973   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL));
1974   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL));
1975   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL));
1976   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL));
1977   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL));
1978   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL));
1979   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ));
1980   PetscFunctionReturn(0);
1981 }
1982 
1983 /*
1984  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1985  */
1986 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace)
1987 {
1988   Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data;
1989   PetscInt     i, m                           = A->rmap->n;
1990   PetscInt     totalslices = a->totalslices;
1991 
1992   PetscFunctionBegin;
1993   C->factortype = A->factortype;
1994   c->row        = NULL;
1995   c->col        = NULL;
1996   c->icol       = NULL;
1997   c->reallocs   = 0;
1998   C->assembled  = PETSC_TRUE;
1999 
2000   PetscCall(PetscLayoutReference(A->rmap, &C->rmap));
2001   PetscCall(PetscLayoutReference(A->cmap, &C->cmap));
2002 
2003   PetscCall(PetscMalloc1(8 * totalslices, &c->rlen));
2004   PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx));
2005 
2006   for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i];
2007   for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i];
2008 
2009   /* allocate the matrix space */
2010   if (mallocmatspace) {
2011     PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx));
2012 
2013     c->singlemalloc = PETSC_TRUE;
2014 
2015     if (m > 0) {
2016       PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat));
2017       if (cpvalues == MAT_COPY_VALUES) {
2018         PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat));
2019       } else {
2020         PetscCall(PetscArrayzero(c->val, a->maxallocmat));
2021       }
2022     }
2023   }
2024 
2025   c->ignorezeroentries = a->ignorezeroentries;
2026   c->roworiented       = a->roworiented;
2027   c->nonew             = a->nonew;
2028   if (a->diag) {
2029     PetscCall(PetscMalloc1(m, &c->diag));
2030     for (i = 0; i < m; i++) c->diag[i] = a->diag[i];
2031   } else c->diag = NULL;
2032 
2033   c->solve_work         = NULL;
2034   c->saved_values       = NULL;
2035   c->idiag              = NULL;
2036   c->ssor_work          = NULL;
2037   c->keepnonzeropattern = a->keepnonzeropattern;
2038   c->free_val           = PETSC_TRUE;
2039   c->free_colidx        = PETSC_TRUE;
2040 
2041   c->maxallocmat  = a->maxallocmat;
2042   c->maxallocrow  = a->maxallocrow;
2043   c->rlenmax      = a->rlenmax;
2044   c->nz           = a->nz;
2045   C->preallocated = PETSC_TRUE;
2046 
2047   c->nonzerorowcnt = a->nonzerorowcnt;
2048   C->nonzerostate  = A->nonzerostate;
2049 
2050   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist));
2051   PetscFunctionReturn(0);
2052 }
2053 
2054 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B)
2055 {
2056   PetscFunctionBegin;
2057   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
2058   PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n));
2059   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2060   PetscCall(MatSetType(*B, ((PetscObject)A)->type_name));
2061   PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE));
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 /*MC
2066    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2067    based on the sliced Ellpack format
2068 
2069    Options Database Keys:
2070 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()`
2071 
2072    Level: beginner
2073 
2074 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2075 M*/
2076 
2077 /*MC
2078    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2079 
2080    This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator,
2081    and `MATMPISELL` otherwise.  As a result, for single process communicators,
2082   `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2083   for communicators controlling multiple processes.  It is recommended that you call both of
2084   the above preallocation routines for simplicity.
2085 
2086    Options Database Keys:
2087 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2088 
2089   Level: beginner
2090 
2091   Notes:
2092    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2093 
2094    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2095    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2096 
2097   Developer Notes:
2098    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2099 
2100    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2101 .vb
2102                             (2 0  3 4)
2103    Consider the matrix A =  (5 0  6 0)
2104                             (0 0  7 8)
2105                             (0 0  9 9)
2106 
2107    symbolically the Ellpack format can be written as
2108 
2109         (2 3 4 |)           (0 2 3 |)
2110    v =  (5 6 0 |)  colidx = (0 2 2 |)
2111         --------            ---------
2112         (7 8 |)             (2 3 |)
2113         (9 9 |)             (2 3 |)
2114 
2115     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2116     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2117     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2118 
2119     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2120 
2121 .ve
2122 
2123       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2124 
2125  References:
2126 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2127    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2128 
2129 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2130 M*/
2131 
2132 /*@C
2133        MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format.
2134 
2135  Collective on comm
2136 
2137  Input Parameters:
2138 +  comm - MPI communicator, set to `PETSC_COMM_SELF`
2139 .  m - number of rows
2140 .  n - number of columns
2141 .  rlenmax - maximum number of nonzeros in a row
2142 -  rlen - array containing the number of nonzeros in the various rows
2143  (possibly different for each row) or NULL
2144 
2145  Output Parameter:
2146 .  A - the matrix
2147 
2148  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
2149  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2150  [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`]
2151 
2152  Notes:
2153  If nnz is given then nz is ignored
2154 
2155  Specify the preallocated storage with either rlenmax or rlen (not both).
2156  Set rlenmax = `PETSC_DEFAULT` and rlen = NULL for PETSc to control dynamic memory
2157  allocation.  For large problems you MUST preallocate memory or you
2158  will get TERRIBLE performance, see the users' manual chapter on matrices.
2159 
2160  Level: intermediate
2161 
2162  .seealso: `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2163  @*/
2164 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt maxallocrow, const PetscInt rlen[], Mat *A)
2165 {
2166   PetscFunctionBegin;
2167   PetscCall(MatCreate(comm, A));
2168   PetscCall(MatSetSizes(*A, m, n, m, n));
2169   PetscCall(MatSetType(*A, MATSEQSELL));
2170   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, maxallocrow, rlen));
2171   PetscFunctionReturn(0);
2172 }
2173 
2174 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg)
2175 {
2176   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data;
2177   PetscInt     totalslices = a->totalslices;
2178 
2179   PetscFunctionBegin;
2180   /* If the  matrix dimensions are not equal,or no of nonzeros */
2181   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2182     *flg = PETSC_FALSE;
2183     PetscFunctionReturn(0);
2184   }
2185   /* if the a->colidx are the same */
2186   PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg));
2187   if (!*flg) PetscFunctionReturn(0);
2188   /* if a->val are the same */
2189   PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg));
2190   PetscFunctionReturn(0);
2191 }
2192 
2193 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2194 {
2195   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2196 
2197   PetscFunctionBegin;
2198   a->idiagvalid = PETSC_FALSE;
2199   PetscFunctionReturn(0);
2200 }
2201 
2202 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2203 {
2204 #if defined(PETSC_USE_COMPLEX)
2205   Mat_SeqSELL *a = (Mat_SeqSELL *)A->data;
2206   PetscInt     i;
2207   PetscScalar *val = a->val;
2208 
2209   PetscFunctionBegin;
2210   for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]);
2211 #else
2212   PetscFunctionBegin;
2213 #endif
2214   PetscFunctionReturn(0);
2215 }
2216