xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 54aabf2d6bfa5240e2c46d4bced775e6fab499d8)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited = PETSC_FALSE;
10 static const char citation[] =
11   "@inproceedings{ZhangELLPACK2018,\n"
12   " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
13   " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
14   " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
15   " year = 2018\n"
16 "}\n";
17 
18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
19 
20   #include <immintrin.h>
21 
22   #if !defined(_MM_SCALE_8)
23   #define _MM_SCALE_8    8
24   #endif
25 
26   #if defined(__AVX512F__)
27   /* these do not work
28    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
29    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
30   */
31     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
32     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
33     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
34     vec_vals = _mm512_loadu_pd(aval); \
35     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
36     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
37   #elif defined(__AVX2__) && defined(__FMA__)
38     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
39     vec_vals = _mm256_loadu_pd(aval); \
40     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
41     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
42     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
43   #endif
44 #endif  /* PETSC_HAVE_IMMINTRIN_H */
45 
46 /*@C
47  MatSeqSELLSetPreallocation - For good matrix assembly performance
48  the user should preallocate the matrix storage by setting the parameter nz
49  (or the array nnz).  By setting these parameters accurately, performance
50  during matrix assembly can be increased significantly.
51 
52  Collective
53 
54  Input Parameters:
55  +  B - The matrix
56  .  nz - number of nonzeros per row (same for all rows)
57  -  nnz - array containing the number of nonzeros in the various rows
58  (possibly different for each row) or NULL
59 
60  Notes:
61  If nnz is given then nz is ignored.
62 
63  Specify the preallocated storage with either nz or nnz (not both).
64  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
65  allocation.  For large problems you MUST preallocate memory or you
66  will get TERRIBLE performance, see the users' manual chapter on matrices.
67 
68  You can call MatGetInfo() to get information on how effective the preallocation was;
69  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
70  You can also run with the option -info and look for messages with the string
71  malloc in them to see if additional memory allocation was needed.
72 
73  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
74  entries or columns indices.
75 
76  The maximum number of nonzeos in any row should be as accurate as possible.
77  If it is underestimated, you will get bad performance due to reallocation
78  (MatSeqXSELLReallocateSELL).
79 
80  Level: intermediate
81 
82  .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
83 
84  @*/
85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
86 {
87   PetscFunctionBegin;
88   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
89   PetscValidType(B,1);
90   PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));
91   PetscFunctionReturn(0);
92 }
93 
94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
95 {
96   Mat_SeqSELL    *b;
97   PetscInt       i,j,totalslices;
98   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
99 
100   PetscFunctionBegin;
101   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
102   if (maxallocrow == MAT_SKIP_ALLOCATION) {
103     skipallocation = PETSC_TRUE;
104     maxallocrow    = 0;
105   }
106 
107   PetscCall(PetscLayoutSetUp(B->rmap));
108   PetscCall(PetscLayoutSetUp(B->cmap));
109 
110   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
111   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
112   PetscCheck(maxallocrow >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow);
113   if (rlen) {
114     for (i=0; i<B->rmap->n; i++) {
115       PetscCheck(rlen[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]);
116       PetscCheck(rlen[i] <= B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n);
117     }
118   }
119 
120   B->preallocated = PETSC_TRUE;
121 
122   b = (Mat_SeqSELL*)B->data;
123 
124   totalslices = PetscCeilInt(B->rmap->n,8);
125   b->totalslices = totalslices;
126   if (!skipallocation) {
127     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n));
128 
129     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
130       PetscCall(PetscMalloc1(totalslices+1,&b->sliidx));
131       PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt)));
132     }
133     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
134       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
135       else if (maxallocrow < 0) maxallocrow = 1;
136       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
137     } else {
138       maxallocrow = 0;
139       b->sliidx[0] = 0;
140       for (i=1; i<totalslices; i++) {
141         b->sliidx[i] = 0;
142         for (j=0;j<8;j++) {
143           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
144         }
145         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
146         PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]));
147       }
148       /* last slice */
149       b->sliidx[totalslices] = 0;
150       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
151       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
152       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
153     }
154 
155     /* allocate space for val, colidx, rlen */
156     /* FIXME: should B's old memory be unlogged? */
157     PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx));
158     /* FIXME: assuming an element of the bit array takes 8 bits */
159     PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx));
160     PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt))));
161     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
162     PetscCall(PetscCalloc1(8*totalslices,&b->rlen));
163     PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt)));
164 
165     b->singlemalloc = PETSC_TRUE;
166     b->free_val     = PETSC_TRUE;
167     b->free_colidx  = PETSC_TRUE;
168   } else {
169     b->free_val    = PETSC_FALSE;
170     b->free_colidx = PETSC_FALSE;
171   }
172 
173   b->nz               = 0;
174   b->maxallocrow      = maxallocrow;
175   b->rlenmax          = maxallocrow;
176   b->maxallocmat      = b->sliidx[totalslices];
177   B->info.nz_unneeded = (double)b->maxallocmat;
178   if (realalloc) PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));
179   PetscFunctionReturn(0);
180 }
181 
182 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
183 {
184   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
185   PetscInt    shift;
186 
187   PetscFunctionBegin;
188   PetscCheck(row >= 0 && row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row);
189   if (nz) *nz = a->rlen[row];
190   shift = a->sliidx[row>>3]+(row&0x07);
191   if (!a->getrowcols) {
192 
193     PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals));
194   }
195   if (idx) {
196     PetscInt j;
197     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
198     *idx = a->getrowcols;
199   }
200   if (v) {
201     PetscInt j;
202     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
203     *v = a->getrowvals;
204   }
205   PetscFunctionReturn(0);
206 }
207 
208 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
209 {
210   PetscFunctionBegin;
211   PetscFunctionReturn(0);
212 }
213 
214 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
215 {
216   Mat            B;
217   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
218   PetscInt       i;
219 
220   PetscFunctionBegin;
221   if (reuse == MAT_REUSE_MATRIX) {
222     B    = *newmat;
223     PetscCall(MatZeroEntries(B));
224   } else {
225     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
226     PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N));
227     PetscCall(MatSetType(B,MATSEQAIJ));
228     PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen));
229   }
230 
231   for (i=0; i<A->rmap->n; i++) {
232     PetscInt    nz = 0,*cols = NULL;
233     PetscScalar *vals = NULL;
234 
235     PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals));
236     PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES));
237     PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals));
238   }
239 
240   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
241   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
242   B->rmap->bs = A->rmap->bs;
243 
244   if (reuse == MAT_INPLACE_MATRIX) {
245     PetscCall(MatHeaderReplace(A,&B));
246   } else {
247     *newmat = B;
248   }
249   PetscFunctionReturn(0);
250 }
251 
252 #include <../src/mat/impls/aij/seq/aij.h>
253 
254 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
255 {
256   Mat               B;
257   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
258   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
259   const PetscInt    *cols;
260   const PetscScalar *vals;
261 
262   PetscFunctionBegin;
263 
264   if (reuse == MAT_REUSE_MATRIX) {
265     B = *newmat;
266   } else {
267     if (PetscDefined(USE_DEBUG) || !a->ilen) {
268       PetscCall(PetscMalloc1(m,&rowlengths));
269       for (i=0; i<m; i++) {
270         rowlengths[i] = ai[i+1] - ai[i];
271       }
272     }
273     if (PetscDefined(USE_DEBUG) && a->ilen) {
274       PetscBool eq;
275       PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq));
276       PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect");
277       PetscCall(PetscFree(rowlengths));
278       rowlengths = a->ilen;
279     } else if (a->ilen) rowlengths = a->ilen;
280     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
281     PetscCall(MatSetSizes(B,m,n,m,n));
282     PetscCall(MatSetType(B,MATSEQSELL));
283     PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths));
284     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
285   }
286 
287   for (row=0; row<m; row++) {
288     PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
289     PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES));
290     PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
291   }
292   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
293   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
294   B->rmap->bs = A->rmap->bs;
295 
296   if (reuse == MAT_INPLACE_MATRIX) {
297     PetscCall(MatHeaderReplace(A,&B));
298   } else {
299     *newmat = B;
300   }
301   PetscFunctionReturn(0);
302 }
303 
304 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
305 {
306   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
307   PetscScalar       *y;
308   const PetscScalar *x;
309   const MatScalar   *aval=a->val;
310   PetscInt          totalslices=a->totalslices;
311   const PetscInt    *acolidx=a->colidx;
312   PetscInt          i,j;
313 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
314   __m512d           vec_x,vec_y,vec_vals;
315   __m256i           vec_idx;
316   __mmask8          mask;
317   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
318   __m256i           vec_idx2,vec_idx3,vec_idx4;
319 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
320   __m128i           vec_idx;
321   __m256d           vec_x,vec_y,vec_y2,vec_vals;
322   MatScalar         yval;
323   PetscInt          r,rows_left,row,nnz_in_row;
324 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
325   __m128d           vec_x_tmp;
326   __m256d           vec_x,vec_y,vec_y2,vec_vals;
327   MatScalar         yval;
328   PetscInt          r,rows_left,row,nnz_in_row;
329 #else
330   PetscScalar       sum[8];
331 #endif
332 
333 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
334 #pragma disjoint(*x,*y,*aval)
335 #endif
336 
337   PetscFunctionBegin;
338   PetscCall(VecGetArrayRead(xx,&x));
339   PetscCall(VecGetArray(yy,&y));
340 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
341   for (i=0; i<totalslices; i++) { /* loop over slices */
342     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
343     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
344 
345     vec_y  = _mm512_setzero_pd();
346     vec_y2 = _mm512_setzero_pd();
347     vec_y3 = _mm512_setzero_pd();
348     vec_y4 = _mm512_setzero_pd();
349 
350     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
351     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
352     case 3:
353       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
354       acolidx += 8; aval += 8;
355       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
356       acolidx += 8; aval += 8;
357       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
358       acolidx += 8; aval += 8;
359       j += 3;
360       break;
361     case 2:
362       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
363       acolidx += 8; aval += 8;
364       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
365       acolidx += 8; aval += 8;
366       j += 2;
367       break;
368     case 1:
369       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
370       acolidx += 8; aval += 8;
371       j += 1;
372       break;
373     }
374     #pragma novector
375     for (; j<(a->sliidx[i+1]>>3); j+=4) {
376       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
377       acolidx += 8; aval += 8;
378       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
379       acolidx += 8; aval += 8;
380       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
381       acolidx += 8; aval += 8;
382       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
383       acolidx += 8; aval += 8;
384     }
385 
386     vec_y = _mm512_add_pd(vec_y,vec_y2);
387     vec_y = _mm512_add_pd(vec_y,vec_y3);
388     vec_y = _mm512_add_pd(vec_y,vec_y4);
389     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
390       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
391       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
392     } else {
393       _mm512_storeu_pd(&y[8*i],vec_y);
394     }
395   }
396 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
397   for (i=0; i<totalslices; i++) { /* loop over full slices */
398     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
399     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
400 
401     /* last slice may have padding rows. Don't use vectorization. */
402     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
403       rows_left = A->rmap->n - 8*i;
404       for (r=0; r<rows_left; ++r) {
405         yval = (MatScalar)0;
406         row = 8*i + r;
407         nnz_in_row = a->rlen[row];
408         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
409         y[row] = yval;
410       }
411       break;
412     }
413 
414     vec_y  = _mm256_setzero_pd();
415     vec_y2 = _mm256_setzero_pd();
416 
417     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
418     #pragma novector
419     #pragma unroll(2)
420     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
421       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
422       aval += 4; acolidx += 4;
423       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
424       aval += 4; acolidx += 4;
425     }
426 
427     _mm256_storeu_pd(y+i*8,vec_y);
428     _mm256_storeu_pd(y+i*8+4,vec_y2);
429   }
430 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
431   for (i=0; i<totalslices; i++) { /* loop over full slices */
432     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
433     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
434 
435     vec_y  = _mm256_setzero_pd();
436     vec_y2 = _mm256_setzero_pd();
437 
438     /* last slice may have padding rows. Don't use vectorization. */
439     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
440       rows_left = A->rmap->n - 8*i;
441       for (r=0; r<rows_left; ++r) {
442         yval = (MatScalar)0;
443         row = 8*i + r;
444         nnz_in_row = a->rlen[row];
445         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
446         y[row] = yval;
447       }
448       break;
449     }
450 
451     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
452     #pragma novector
453     #pragma unroll(2)
454     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
455       vec_vals  = _mm256_loadu_pd(aval);
456       vec_x_tmp = _mm_setzero_pd();
457       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
458       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
459       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
460       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
461       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
462       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
463       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
464       aval     += 4;
465 
466       vec_vals  = _mm256_loadu_pd(aval);
467       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
468       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
469       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
470       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
471       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
472       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
473       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
474       aval     += 4;
475     }
476 
477     _mm256_storeu_pd(y + i*8,     vec_y);
478     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
479   }
480 #else
481   for (i=0; i<totalslices; i++) { /* loop over slices */
482     for (j=0; j<8; j++) sum[j] = 0.0;
483     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
484       sum[0] += aval[j] * x[acolidx[j]];
485       sum[1] += aval[j+1] * x[acolidx[j+1]];
486       sum[2] += aval[j+2] * x[acolidx[j+2]];
487       sum[3] += aval[j+3] * x[acolidx[j+3]];
488       sum[4] += aval[j+4] * x[acolidx[j+4]];
489       sum[5] += aval[j+5] * x[acolidx[j+5]];
490       sum[6] += aval[j+6] * x[acolidx[j+6]];
491       sum[7] += aval[j+7] * x[acolidx[j+7]];
492     }
493     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
494       for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
495     } else {
496       for (j=0; j<8; j++) y[8*i+j] = sum[j];
497     }
498   }
499 #endif
500 
501   PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */
502   PetscCall(VecRestoreArrayRead(xx,&x));
503   PetscCall(VecRestoreArray(yy,&y));
504   PetscFunctionReturn(0);
505 }
506 
507 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
508 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
509 {
510   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
511   PetscScalar       *y,*z;
512   const PetscScalar *x;
513   const MatScalar   *aval=a->val;
514   PetscInt          totalslices=a->totalslices;
515   const PetscInt    *acolidx=a->colidx;
516   PetscInt          i,j;
517 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
518   __m512d           vec_x,vec_y,vec_vals;
519   __m256i           vec_idx;
520   __mmask8          mask;
521   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
522   __m256i           vec_idx2,vec_idx3,vec_idx4;
523 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
524   __m128d           vec_x_tmp;
525   __m256d           vec_x,vec_y,vec_y2,vec_vals;
526   MatScalar         yval;
527   PetscInt          r,row,nnz_in_row;
528 #else
529   PetscScalar       sum[8];
530 #endif
531 
532 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
533 #pragma disjoint(*x,*y,*aval)
534 #endif
535 
536   PetscFunctionBegin;
537   PetscCall(VecGetArrayRead(xx,&x));
538   PetscCall(VecGetArrayPair(yy,zz,&y,&z));
539 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
540   for (i=0; i<totalslices; i++) { /* loop over slices */
541     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
542     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
543 
544     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
545       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
546       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
547     } else {
548       vec_y  = _mm512_loadu_pd(&y[8*i]);
549     }
550     vec_y2 = _mm512_setzero_pd();
551     vec_y3 = _mm512_setzero_pd();
552     vec_y4 = _mm512_setzero_pd();
553 
554     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
555     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
556     case 3:
557       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
558       acolidx += 8; aval += 8;
559       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
560       acolidx += 8; aval += 8;
561       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
562       acolidx += 8; aval += 8;
563       j += 3;
564       break;
565     case 2:
566       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
567       acolidx += 8; aval += 8;
568       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
569       acolidx += 8; aval += 8;
570       j += 2;
571       break;
572     case 1:
573       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
574       acolidx += 8; aval += 8;
575       j += 1;
576       break;
577     }
578     #pragma novector
579     for (; j<(a->sliidx[i+1]>>3); j+=4) {
580       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
581       acolidx += 8; aval += 8;
582       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
583       acolidx += 8; aval += 8;
584       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
585       acolidx += 8; aval += 8;
586       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
587       acolidx += 8; aval += 8;
588     }
589 
590     vec_y = _mm512_add_pd(vec_y,vec_y2);
591     vec_y = _mm512_add_pd(vec_y,vec_y3);
592     vec_y = _mm512_add_pd(vec_y,vec_y4);
593     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
594       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
595     } else {
596       _mm512_storeu_pd(&z[8*i],vec_y);
597     }
598   }
599 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
600   for (i=0; i<totalslices; i++) { /* loop over full slices */
601     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
602     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
603 
604     /* last slice may have padding rows. Don't use vectorization. */
605     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
606       for (r=0; r<(A->rmap->n & 0x07); ++r) {
607         row        = 8*i + r;
608         yval       = (MatScalar)0.0;
609         nnz_in_row = a->rlen[row];
610         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
611         z[row] = y[row] + yval;
612       }
613       break;
614     }
615 
616     vec_y  = _mm256_loadu_pd(y+8*i);
617     vec_y2 = _mm256_loadu_pd(y+8*i+4);
618 
619     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
620     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
621       vec_vals  = _mm256_loadu_pd(aval);
622       vec_x_tmp = _mm_setzero_pd();
623       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
624       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
625       vec_x     = _mm256_setzero_pd();
626       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
627       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
628       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
629       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
630       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
631       aval     += 4;
632 
633       vec_vals  = _mm256_loadu_pd(aval);
634       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
635       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
636       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
637       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
638       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
639       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
640       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
641       aval     += 4;
642     }
643 
644     _mm256_storeu_pd(z+i*8,vec_y);
645     _mm256_storeu_pd(z+i*8+4,vec_y2);
646   }
647 #else
648   for (i=0; i<totalslices; i++) { /* loop over slices */
649     for (j=0; j<8; j++) sum[j] = 0.0;
650     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
651       sum[0] += aval[j] * x[acolidx[j]];
652       sum[1] += aval[j+1] * x[acolidx[j+1]];
653       sum[2] += aval[j+2] * x[acolidx[j+2]];
654       sum[3] += aval[j+3] * x[acolidx[j+3]];
655       sum[4] += aval[j+4] * x[acolidx[j+4]];
656       sum[5] += aval[j+5] * x[acolidx[j+5]];
657       sum[6] += aval[j+6] * x[acolidx[j+6]];
658       sum[7] += aval[j+7] * x[acolidx[j+7]];
659     }
660     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
661       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
662     } else {
663       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
664     }
665   }
666 #endif
667 
668   PetscCall(PetscLogFlops(2.0*a->nz));
669   PetscCall(VecRestoreArrayRead(xx,&x));
670   PetscCall(VecRestoreArrayPair(yy,zz,&y,&z));
671   PetscFunctionReturn(0);
672 }
673 
674 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
675 {
676   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
677   PetscScalar       *y;
678   const PetscScalar *x;
679   const MatScalar   *aval=a->val;
680   const PetscInt    *acolidx=a->colidx;
681   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
682 
683 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
684 #pragma disjoint(*x,*y,*aval)
685 #endif
686 
687   PetscFunctionBegin;
688   if (A->symmetric) {
689     PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy));
690     PetscFunctionReturn(0);
691   }
692   if (zz != yy) PetscCall(VecCopy(zz,yy));
693   PetscCall(VecGetArrayRead(xx,&x));
694   PetscCall(VecGetArray(yy,&y));
695   for (i=0; i<a->totalslices; i++) { /* loop over slices */
696     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
697       for (r=0; r<(A->rmap->n & 0x07); ++r) {
698         row        = 8*i + r;
699         nnz_in_row = a->rlen[row];
700         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
701       }
702       break;
703     }
704     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
705       y[acolidx[j]]   += aval[j] * x[8*i];
706       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
707       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
708       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
709       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
710       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
711       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
712       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
713     }
714   }
715   PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices]));
716   PetscCall(VecRestoreArrayRead(xx,&x));
717   PetscCall(VecRestoreArray(yy,&y));
718   PetscFunctionReturn(0);
719 }
720 
721 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
722 {
723   PetscFunctionBegin;
724   if (A->symmetric) {
725     PetscCall(MatMult_SeqSELL(A,xx,yy));
726   } else {
727     PetscCall(VecSet(yy,0.0));
728     PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy));
729   }
730   PetscFunctionReturn(0);
731 }
732 
733 /*
734      Checks for missing diagonals
735 */
736 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
737 {
738   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
739   PetscInt       *diag,i;
740 
741   PetscFunctionBegin;
742   *missing = PETSC_FALSE;
743   if (A->rmap->n > 0 && !(a->colidx)) {
744     *missing = PETSC_TRUE;
745     if (d) *d = 0;
746     PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n"));
747   } else {
748     diag = a->diag;
749     for (i=0; i<A->rmap->n; i++) {
750       if (diag[i] == -1) {
751         *missing = PETSC_TRUE;
752         if (d) *d = i;
753         PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i));
754         break;
755       }
756     }
757   }
758   PetscFunctionReturn(0);
759 }
760 
761 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
762 {
763   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
764   PetscInt       i,j,m=A->rmap->n,shift;
765 
766   PetscFunctionBegin;
767   if (!a->diag) {
768     PetscCall(PetscMalloc1(m,&a->diag));
769     PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt)));
770     a->free_diag = PETSC_TRUE;
771   }
772   for (i=0; i<m; i++) { /* loop over rows */
773     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
774     a->diag[i] = -1;
775     for (j=0; j<a->rlen[i]; j++) {
776       if (a->colidx[shift+j*8] == i) {
777         a->diag[i] = shift+j*8;
778         break;
779       }
780     }
781   }
782   PetscFunctionReturn(0);
783 }
784 
785 /*
786   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
787 */
788 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
789 {
790   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
791   PetscInt       i,*diag,m = A->rmap->n;
792   MatScalar      *val = a->val;
793   PetscScalar    *idiag,*mdiag;
794 
795   PetscFunctionBegin;
796   if (a->idiagvalid) PetscFunctionReturn(0);
797   PetscCall(MatMarkDiagonal_SeqSELL(A));
798   diag = a->diag;
799   if (!a->idiag) {
800     PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work));
801     PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar)));
802     val  = a->val;
803   }
804   mdiag = a->mdiag;
805   idiag = a->idiag;
806 
807   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
808     for (i=0; i<m; i++) {
809       mdiag[i] = val[diag[i]];
810       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
811         if (PetscRealPart(fshift)) {
812           PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i));
813           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
814           A->factorerror_zeropivot_value = 0.0;
815           A->factorerror_zeropivot_row   = i;
816         } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i);
817       }
818       idiag[i] = 1.0/val[diag[i]];
819     }
820     PetscCall(PetscLogFlops(m));
821   } else {
822     for (i=0; i<m; i++) {
823       mdiag[i] = val[diag[i]];
824       idiag[i] = omega/(fshift + val[diag[i]]);
825     }
826     PetscCall(PetscLogFlops(2.0*m));
827   }
828   a->idiagvalid = PETSC_TRUE;
829   PetscFunctionReturn(0);
830 }
831 
832 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
833 {
834   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
835 
836   PetscFunctionBegin;
837   PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices]));
838   PetscCall(MatSeqSELLInvalidateDiagonal(A));
839   PetscFunctionReturn(0);
840 }
841 
842 PetscErrorCode MatDestroy_SeqSELL(Mat A)
843 {
844   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
845 
846   PetscFunctionBegin;
847 #if defined(PETSC_USE_LOG)
848   PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz);
849 #endif
850   PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx));
851   PetscCall(ISDestroy(&a->row));
852   PetscCall(ISDestroy(&a->col));
853   PetscCall(PetscFree(a->diag));
854   PetscCall(PetscFree(a->rlen));
855   PetscCall(PetscFree(a->sliidx));
856   PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work));
857   PetscCall(PetscFree(a->solve_work));
858   PetscCall(ISDestroy(&a->icol));
859   PetscCall(PetscFree(a->saved_values));
860   PetscCall(PetscFree2(a->getrowcols,a->getrowvals));
861 
862   PetscCall(PetscFree(A->data));
863 
864   PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
865   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL));
866   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL));
867   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL));
868   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLGetArray_C",NULL));
869   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLRestoreArray_C",NULL));
870   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqsell_seqaij_C",NULL));
871   PetscFunctionReturn(0);
872 }
873 
874 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
875 {
876   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
877 
878   PetscFunctionBegin;
879   switch (op) {
880   case MAT_ROW_ORIENTED:
881     a->roworiented = flg;
882     break;
883   case MAT_KEEP_NONZERO_PATTERN:
884     a->keepnonzeropattern = flg;
885     break;
886   case MAT_NEW_NONZERO_LOCATIONS:
887     a->nonew = (flg ? 0 : 1);
888     break;
889   case MAT_NEW_NONZERO_LOCATION_ERR:
890     a->nonew = (flg ? -1 : 0);
891     break;
892   case MAT_NEW_NONZERO_ALLOCATION_ERR:
893     a->nonew = (flg ? -2 : 0);
894     break;
895   case MAT_UNUSED_NONZERO_LOCATION_ERR:
896     a->nounused = (flg ? -1 : 0);
897     break;
898   case MAT_FORCE_DIAGONAL_ENTRIES:
899   case MAT_IGNORE_OFF_PROC_ENTRIES:
900   case MAT_USE_HASH_TABLE:
901   case MAT_SORTED_FULL:
902     PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op]));
903     break;
904   case MAT_SPD:
905   case MAT_SYMMETRIC:
906   case MAT_STRUCTURALLY_SYMMETRIC:
907   case MAT_HERMITIAN:
908   case MAT_SYMMETRY_ETERNAL:
909     /* These options are handled directly by MatSetOption() */
910     break;
911   default:
912     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
913   }
914   PetscFunctionReturn(0);
915 }
916 
917 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
918 {
919   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
920   PetscInt       i,j,n,shift;
921   PetscScalar    *x,zero=0.0;
922 
923   PetscFunctionBegin;
924   PetscCall(VecGetLocalSize(v,&n));
925   PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
926 
927   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
928     PetscInt *diag=a->diag;
929     PetscCall(VecGetArray(v,&x));
930     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
931     PetscCall(VecRestoreArray(v,&x));
932     PetscFunctionReturn(0);
933   }
934 
935   PetscCall(VecSet(v,zero));
936   PetscCall(VecGetArray(v,&x));
937   for (i=0; i<n; i++) { /* loop over rows */
938     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
939     x[i] = 0;
940     for (j=0; j<a->rlen[i]; j++) {
941       if (a->colidx[shift+j*8] == i) {
942         x[i] = a->val[shift+j*8];
943         break;
944       }
945     }
946   }
947   PetscCall(VecRestoreArray(v,&x));
948   PetscFunctionReturn(0);
949 }
950 
951 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
952 {
953   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
954   const PetscScalar *l,*r;
955   PetscInt          i,j,m,n,row;
956 
957   PetscFunctionBegin;
958   if (ll) {
959     /* The local size is used so that VecMPI can be passed to this routine
960        by MatDiagonalScale_MPISELL */
961     PetscCall(VecGetLocalSize(ll,&m));
962     PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
963     PetscCall(VecGetArrayRead(ll,&l));
964     for (i=0; i<a->totalslices; i++) { /* loop over slices */
965       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
966         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
967           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row];
968         }
969       } else {
970         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
971           a->val[j] *= l[8*i+row];
972         }
973       }
974     }
975     PetscCall(VecRestoreArrayRead(ll,&l));
976     PetscCall(PetscLogFlops(a->nz));
977   }
978   if (rr) {
979     PetscCall(VecGetLocalSize(rr,&n));
980     PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
981     PetscCall(VecGetArrayRead(rr,&r));
982     for (i=0; i<a->totalslices; i++) { /* loop over slices */
983       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
984         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
985           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
986         }
987       } else {
988         for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
989           a->val[j] *= r[a->colidx[j]];
990         }
991       }
992     }
993     PetscCall(VecRestoreArrayRead(rr,&r));
994     PetscCall(PetscLogFlops(a->nz));
995   }
996   PetscCall(MatSeqSELLInvalidateDiagonal(A));
997   PetscFunctionReturn(0);
998 }
999 
1000 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
1001 {
1002   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1003   PetscInt    *cp,i,k,low,high,t,row,col,l;
1004   PetscInt    shift;
1005   MatScalar   *vp;
1006 
1007   PetscFunctionBegin;
1008   for (k=0; k<m; k++) { /* loop over requested rows */
1009     row = im[k];
1010     if (row<0) continue;
1011     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1012     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1013     cp = a->colidx+shift; /* pointer to the row */
1014     vp = a->val+shift; /* pointer to the row */
1015     for (l=0; l<n; l++) { /* loop over requested columns */
1016       col = in[l];
1017       if (col<0) continue;
1018       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1019       high = a->rlen[row]; low = 0; /* assume unsorted */
1020       while (high-low > 5) {
1021         t = (low+high)/2;
1022         if (*(cp+t*8) > col) high = t;
1023         else low = t;
1024       }
1025       for (i=low; i<high; i++) {
1026         if (*(cp+8*i) > col) break;
1027         if (*(cp+8*i) == col) {
1028           *v++ = *(vp+8*i);
1029           goto finished;
1030         }
1031       }
1032       *v++ = 0.0;
1033     finished:;
1034     }
1035   }
1036   PetscFunctionReturn(0);
1037 }
1038 
1039 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1040 {
1041   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1042   PetscInt          i,j,m=A->rmap->n,shift;
1043   const char        *name;
1044   PetscViewerFormat format;
1045 
1046   PetscFunctionBegin;
1047   PetscCall(PetscViewerGetFormat(viewer,&format));
1048   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1049     PetscInt nofinalvalue = 0;
1050     /*
1051     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1052       nofinalvalue = 1;
1053     }
1054     */
1055     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1056     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n));
1057     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz));
1058 #if defined(PETSC_USE_COMPLEX)
1059     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue));
1060 #else
1061     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue));
1062 #endif
1063     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n"));
1064 
1065     for (i=0; i<m; i++) {
1066       shift = a->sliidx[i>>3]+(i&0x07);
1067       for (j=0; j<a->rlen[i]; j++) {
1068 #if defined(PETSC_USE_COMPLEX)
1069         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1070 #else
1071         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]));
1072 #endif
1073       }
1074     }
1075     /*
1076     if (nofinalvalue) {
1077 #if defined(PETSC_USE_COMPLEX)
1078       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1079 #else
1080       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1081 #endif
1082     }
1083     */
1084     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1085     PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name));
1086     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1087   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1088     PetscFunctionReturn(0);
1089   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1090     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1091     for (i=0; i<m; i++) {
1092       PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1093       shift = a->sliidx[i>>3]+(i&0x07);
1094       for (j=0; j<a->rlen[i]; j++) {
1095 #if defined(PETSC_USE_COMPLEX)
1096         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1097           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1098         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1099           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1100         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1101           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1102         }
1103 #else
1104         if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1105 #endif
1106       }
1107       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1108     }
1109     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1110   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1111     PetscInt    cnt=0,jcnt;
1112     PetscScalar value;
1113 #if defined(PETSC_USE_COMPLEX)
1114     PetscBool   realonly=PETSC_TRUE;
1115     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1116       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1117         realonly = PETSC_FALSE;
1118         break;
1119       }
1120     }
1121 #endif
1122 
1123     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1124     for (i=0; i<m; i++) {
1125       jcnt = 0;
1126       shift = a->sliidx[i>>3]+(i&0x07);
1127       for (j=0; j<A->cmap->n; j++) {
1128         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1129           value = a->val[cnt++];
1130           jcnt++;
1131         } else {
1132           value = 0.0;
1133         }
1134 #if defined(PETSC_USE_COMPLEX)
1135         if (realonly) {
1136           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value)));
1137         } else {
1138           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value)));
1139         }
1140 #else
1141         PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value));
1142 #endif
1143       }
1144       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1145     }
1146     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1147   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1148     PetscInt fshift=1;
1149     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1150 #if defined(PETSC_USE_COMPLEX)
1151     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n"));
1152 #else
1153     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n"));
1154 #endif
1155     PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1156     for (i=0; i<m; i++) {
1157       shift = a->sliidx[i>>3]+(i&0x07);
1158       for (j=0; j<a->rlen[i]; j++) {
1159 #if defined(PETSC_USE_COMPLEX)
1160         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1161 #else
1162         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]));
1163 #endif
1164       }
1165     }
1166     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1167   } else if (format == PETSC_VIEWER_NATIVE) {
1168     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1169       PetscInt row;
1170       PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1]));
1171       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1172 #if defined(PETSC_USE_COMPLEX)
1173         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1174           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1175         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1176           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j])));
1177         } else {
1178           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j])));
1179         }
1180 #else
1181         PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j]));
1182 #endif
1183       }
1184     }
1185   } else {
1186     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1187     if (A->factortype) {
1188       for (i=0; i<m; i++) {
1189         shift = a->sliidx[i>>3]+(i&0x07);
1190         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1191         /* L part */
1192         for (j=shift; j<a->diag[i]; j+=8) {
1193 #if defined(PETSC_USE_COMPLEX)
1194           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1195             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1196           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1197             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1198           } else {
1199             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1200           }
1201 #else
1202           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1203 #endif
1204         }
1205         /* diagonal */
1206         j = a->diag[i];
1207 #if defined(PETSC_USE_COMPLEX)
1208         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1209           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j])));
1210         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1211           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j]))));
1212         } else {
1213           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j])));
1214         }
1215 #else
1216         PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j])));
1217 #endif
1218 
1219         /* U part */
1220         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1221 #if defined(PETSC_USE_COMPLEX)
1222           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1223             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1224           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1225             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1226           } else {
1227             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1228           }
1229 #else
1230           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1231 #endif
1232         }
1233         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1234       }
1235     } else {
1236       for (i=0; i<m; i++) {
1237         shift = a->sliidx[i>>3]+(i&0x07);
1238         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1239         for (j=0; j<a->rlen[i]; j++) {
1240 #if defined(PETSC_USE_COMPLEX)
1241           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1242             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1243           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1244             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1245           } else {
1246             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1247           }
1248 #else
1249           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1250 #endif
1251         }
1252         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1253       }
1254     }
1255     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1256   }
1257   PetscCall(PetscViewerFlush(viewer));
1258   PetscFunctionReturn(0);
1259 }
1260 
1261 #include <petscdraw.h>
1262 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1263 {
1264   Mat               A=(Mat)Aa;
1265   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1266   PetscInt          i,j,m=A->rmap->n,shift;
1267   int               color;
1268   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1269   PetscViewer       viewer;
1270   PetscViewerFormat format;
1271 
1272   PetscFunctionBegin;
1273   PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer));
1274   PetscCall(PetscViewerGetFormat(viewer,&format));
1275   PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr));
1276 
1277   /* loop over matrix elements drawing boxes */
1278 
1279   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1280     PetscDrawCollectiveBegin(draw);
1281     /* Blue for negative, Cyan for zero and  Red for positive */
1282     color = PETSC_DRAW_BLUE;
1283     for (i=0; i<m; i++) {
1284       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1285       y_l = m - i - 1.0; y_r = y_l + 1.0;
1286       for (j=0; j<a->rlen[i]; j++) {
1287         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1288         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1289         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1290       }
1291     }
1292     color = PETSC_DRAW_CYAN;
1293     for (i=0; i<m; i++) {
1294       shift = a->sliidx[i>>3]+(i&0x07);
1295       y_l = m - i - 1.0; y_r = y_l + 1.0;
1296       for (j=0; j<a->rlen[i]; j++) {
1297         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1298         if (a->val[shift+8*j] !=  0.) continue;
1299         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1300       }
1301     }
1302     color = PETSC_DRAW_RED;
1303     for (i=0; i<m; i++) {
1304       shift = a->sliidx[i>>3]+(i&0x07);
1305       y_l = m - i - 1.0; y_r = y_l + 1.0;
1306       for (j=0; j<a->rlen[i]; j++) {
1307         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1308         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1309         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1310       }
1311     }
1312     PetscDrawCollectiveEnd(draw);
1313   } else {
1314     /* use contour shading to indicate magnitude of values */
1315     /* first determine max of all nonzero values */
1316     PetscReal minv=0.0,maxv=0.0;
1317     PetscInt  count=0;
1318     PetscDraw popup;
1319     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1320       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1321     }
1322     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1323     PetscCall(PetscDrawGetPopup(draw,&popup));
1324     PetscCall(PetscDrawScalePopup(popup,minv,maxv));
1325 
1326     PetscDrawCollectiveBegin(draw);
1327     for (i=0; i<m; i++) {
1328       shift = a->sliidx[i>>3]+(i&0x07);
1329       y_l = m - i - 1.0;
1330       y_r = y_l + 1.0;
1331       for (j=0; j<a->rlen[i]; j++) {
1332         x_l = a->colidx[shift+j*8];
1333         x_r = x_l + 1.0;
1334         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1335         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1336         count++;
1337       }
1338     }
1339     PetscDrawCollectiveEnd(draw);
1340   }
1341   PetscFunctionReturn(0);
1342 }
1343 
1344 #include <petscdraw.h>
1345 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1346 {
1347   PetscDraw      draw;
1348   PetscReal      xr,yr,xl,yl,h,w;
1349   PetscBool      isnull;
1350 
1351   PetscFunctionBegin;
1352   PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw));
1353   PetscCall(PetscDrawIsNull(draw,&isnull));
1354   if (isnull) PetscFunctionReturn(0);
1355 
1356   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1357   xr  += w;          yr += h;         xl = -w;     yl = -h;
1358   PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr));
1359   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer));
1360   PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A));
1361   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL));
1362   PetscCall(PetscDrawSave(draw));
1363   PetscFunctionReturn(0);
1364 }
1365 
1366 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1367 {
1368   PetscBool      iascii,isbinary,isdraw;
1369 
1370   PetscFunctionBegin;
1371   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
1372   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary));
1373   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw));
1374   if (iascii) {
1375     PetscCall(MatView_SeqSELL_ASCII(A,viewer));
1376   } else if (isbinary) {
1377     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1378   } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A,viewer));
1379   PetscFunctionReturn(0);
1380 }
1381 
1382 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1383 {
1384   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1385   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1386   MatScalar      *vp;
1387 
1388   PetscFunctionBegin;
1389   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1390   /* To do: compress out the unused elements */
1391   PetscCall(MatMarkDiagonal_SeqSELL(A));
1392   PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz));
1393   PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs));
1394   PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax));
1395   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1396   for (i=0; i<a->totalslices; ++i) {
1397     shift = a->sliidx[i];    /* starting index of the slice */
1398     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1399     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1400     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1401       row  = 8*i + row_in_slice;
1402       nrow = a->rlen[row]; /* number of nonzeros in row */
1403       /*
1404         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1405         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1406       */
1407       lastcol = 0;
1408       if (nrow>0) { /* nonempty row */
1409         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1410       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1411         for (j=1;j<8;j++) {
1412           if (a->rlen[8*i+j]) {
1413             lastcol = cp[j];
1414             break;
1415           }
1416         }
1417       } else {
1418         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1419       }
1420 
1421       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1422         cp[8*k+row_in_slice] = lastcol;
1423         vp[8*k+row_in_slice] = (MatScalar)0;
1424       }
1425     }
1426   }
1427 
1428   A->info.mallocs += a->reallocs;
1429   a->reallocs      = 0;
1430 
1431   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1432   PetscFunctionReturn(0);
1433 }
1434 
1435 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1436 {
1437   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1438 
1439   PetscFunctionBegin;
1440   info->block_size   = 1.0;
1441   info->nz_allocated = a->maxallocmat;
1442   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1443   info->nz_unneeded  = (a->maxallocmat-a->sliidx[a->totalslices]);
1444   info->assemblies   = A->num_ass;
1445   info->mallocs      = A->info.mallocs;
1446   info->memory       = ((PetscObject)A)->mem;
1447   if (A->factortype) {
1448     info->fill_ratio_given  = A->info.fill_ratio_given;
1449     info->fill_ratio_needed = A->info.fill_ratio_needed;
1450     info->factor_mallocs    = A->info.factor_mallocs;
1451   } else {
1452     info->fill_ratio_given  = 0;
1453     info->fill_ratio_needed = 0;
1454     info->factor_mallocs    = 0;
1455   }
1456   PetscFunctionReturn(0);
1457 }
1458 
1459 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1460 {
1461   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1462   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1463   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1464   MatScalar      *vp,value;
1465 
1466   PetscFunctionBegin;
1467   for (k=0; k<m; k++) { /* loop over added rows */
1468     row = im[k];
1469     if (row < 0) continue;
1470     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1471     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1472     cp    = a->colidx+shift; /* pointer to the row */
1473     vp    = a->val+shift; /* pointer to the row */
1474     nrow  = a->rlen[row];
1475     low   = 0;
1476     high  = nrow;
1477 
1478     for (l=0; l<n; l++) { /* loop over added columns */
1479       col = in[l];
1480       if (col<0) continue;
1481       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1482       if (a->roworiented) {
1483         value = v[l+k*n];
1484       } else {
1485         value = v[k+l*m];
1486       }
1487       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1488 
1489       /* search in this row for the specified column, i indicates the column to be set */
1490       if (col <= lastcol) low = 0;
1491       else high = nrow;
1492       lastcol = col;
1493       while (high-low > 5) {
1494         t = (low+high)/2;
1495         if (*(cp+t*8) > col) high = t;
1496         else low = t;
1497       }
1498       for (i=low; i<high; i++) {
1499         if (*(cp+i*8) > col) break;
1500         if (*(cp+i*8) == col) {
1501           if (is == ADD_VALUES) *(vp+i*8) += value;
1502           else *(vp+i*8) = value;
1503           low = i + 1;
1504           goto noinsert;
1505         }
1506       }
1507       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1508       if (nonew == 1) goto noinsert;
1509       PetscCheck(nonew != -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1510       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1511       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1512       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1513       for (ii=nrow-1; ii>=i; ii--) {
1514         *(cp+(ii+1)*8) = *(cp+ii*8);
1515         *(vp+(ii+1)*8) = *(vp+ii*8);
1516       }
1517       a->rlen[row]++;
1518       *(cp+i*8) = col;
1519       *(vp+i*8) = value;
1520       a->nz++;
1521       A->nonzerostate++;
1522       low = i+1; high++; nrow++;
1523 noinsert:;
1524     }
1525     a->rlen[row] = nrow;
1526   }
1527   PetscFunctionReturn(0);
1528 }
1529 
1530 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1531 {
1532   PetscFunctionBegin;
1533   /* If the two matrices have the same copy implementation, use fast copy. */
1534   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1535     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1536     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1537 
1538     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1539     PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]));
1540   } else {
1541     PetscCall(MatCopy_Basic(A,B,str));
1542   }
1543   PetscFunctionReturn(0);
1544 }
1545 
1546 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1547 {
1548   PetscFunctionBegin;
1549   PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL));
1550   PetscFunctionReturn(0);
1551 }
1552 
1553 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1554 {
1555   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1556 
1557   PetscFunctionBegin;
1558   *array = a->val;
1559   PetscFunctionReturn(0);
1560 }
1561 
1562 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1563 {
1564   PetscFunctionBegin;
1565   PetscFunctionReturn(0);
1566 }
1567 
1568 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1569 {
1570   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1571   PetscInt    i;
1572   MatScalar   *aval=a->val;
1573 
1574   PetscFunctionBegin;
1575   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1576   PetscFunctionReturn(0);
1577 }
1578 
1579 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1580 {
1581   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1582   PetscInt       i;
1583   MatScalar      *aval=a->val;
1584 
1585   PetscFunctionBegin;
1586   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1587   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1588   PetscFunctionReturn(0);
1589 }
1590 
1591 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1592 {
1593   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1594   MatScalar      *aval=a->val;
1595   PetscScalar    oalpha=alpha;
1596   PetscBLASInt   one=1,size;
1597 
1598   PetscFunctionBegin;
1599   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size));
1600   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1601   PetscCall(PetscLogFlops(a->nz));
1602   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1603   PetscFunctionReturn(0);
1604 }
1605 
1606 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1607 {
1608   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1609 
1610   PetscFunctionBegin;
1611   if (!Y->preallocated || !y->nz) {
1612     PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL));
1613   }
1614   PetscCall(MatShift_Basic(Y,a));
1615   PetscFunctionReturn(0);
1616 }
1617 
1618 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1619 {
1620   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1621   PetscScalar       *x,sum,*t;
1622   const MatScalar   *idiag=NULL,*mdiag;
1623   const PetscScalar *b,*xb;
1624   PetscInt          n,m=A->rmap->n,i,j,shift;
1625   const PetscInt    *diag;
1626 
1627   PetscFunctionBegin;
1628   its = its*lits;
1629 
1630   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1631   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift));
1632   a->fshift = fshift;
1633   a->omega  = omega;
1634 
1635   diag  = a->diag;
1636   t     = a->ssor_work;
1637   idiag = a->idiag;
1638   mdiag = a->mdiag;
1639 
1640   PetscCall(VecGetArray(xx,&x));
1641   PetscCall(VecGetArrayRead(bb,&b));
1642   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1643   PetscCheck(flag != SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1644   PetscCheck(flag != SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1645   PetscCheck(!(flag & SOR_EISENSTAT),PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1646 
1647   if (flag & SOR_ZERO_INITIAL_GUESS) {
1648     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1649       for (i=0; i<m; i++) {
1650         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1651         sum   = b[i];
1652         n     = (diag[i]-shift)/8;
1653         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1654         t[i]  = sum;
1655         x[i]  = sum*idiag[i];
1656       }
1657       xb   = t;
1658       PetscCall(PetscLogFlops(a->nz));
1659     } else xb = b;
1660     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1661       for (i=m-1; i>=0; i--) {
1662         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1663         sum   = xb[i];
1664         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1665         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1666         if (xb == b) {
1667           x[i] = sum*idiag[i];
1668         } else {
1669           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1670         }
1671       }
1672       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1673     }
1674     its--;
1675   }
1676   while (its--) {
1677     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1678       for (i=0; i<m; i++) {
1679         /* lower */
1680         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1681         sum   = b[i];
1682         n     = (diag[i]-shift)/8;
1683         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1684         t[i]  = sum;             /* save application of the lower-triangular part */
1685         /* upper */
1686         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1687         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1688         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1689       }
1690       xb   = t;
1691       PetscCall(PetscLogFlops(2.0*a->nz));
1692     } else xb = b;
1693     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1694       for (i=m-1; i>=0; i--) {
1695         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1696         sum = xb[i];
1697         if (xb == b) {
1698           /* whole matrix (no checkpointing available) */
1699           n     = a->rlen[i];
1700           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1701           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1702         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1703           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1704           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1705           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1706         }
1707       }
1708       if (xb == b) {
1709         PetscCall(PetscLogFlops(2.0*a->nz));
1710       } else {
1711         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1712       }
1713     }
1714   }
1715   PetscCall(VecRestoreArray(xx,&x));
1716   PetscCall(VecRestoreArrayRead(bb,&b));
1717   PetscFunctionReturn(0);
1718 }
1719 
1720 /* -------------------------------------------------------------------*/
1721 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1722                                        MatGetRow_SeqSELL,
1723                                        MatRestoreRow_SeqSELL,
1724                                        MatMult_SeqSELL,
1725                                /* 4*/  MatMultAdd_SeqSELL,
1726                                        MatMultTranspose_SeqSELL,
1727                                        MatMultTransposeAdd_SeqSELL,
1728                                        NULL,
1729                                        NULL,
1730                                        NULL,
1731                                /* 10*/ NULL,
1732                                        NULL,
1733                                        NULL,
1734                                        MatSOR_SeqSELL,
1735                                        NULL,
1736                                /* 15*/ MatGetInfo_SeqSELL,
1737                                        MatEqual_SeqSELL,
1738                                        MatGetDiagonal_SeqSELL,
1739                                        MatDiagonalScale_SeqSELL,
1740                                        NULL,
1741                                /* 20*/ NULL,
1742                                        MatAssemblyEnd_SeqSELL,
1743                                        MatSetOption_SeqSELL,
1744                                        MatZeroEntries_SeqSELL,
1745                                /* 24*/ NULL,
1746                                        NULL,
1747                                        NULL,
1748                                        NULL,
1749                                        NULL,
1750                                /* 29*/ MatSetUp_SeqSELL,
1751                                        NULL,
1752                                        NULL,
1753                                        NULL,
1754                                        NULL,
1755                                /* 34*/ MatDuplicate_SeqSELL,
1756                                        NULL,
1757                                        NULL,
1758                                        NULL,
1759                                        NULL,
1760                                /* 39*/ NULL,
1761                                        NULL,
1762                                        NULL,
1763                                        MatGetValues_SeqSELL,
1764                                        MatCopy_SeqSELL,
1765                                /* 44*/ NULL,
1766                                        MatScale_SeqSELL,
1767                                        MatShift_SeqSELL,
1768                                        NULL,
1769                                        NULL,
1770                                /* 49*/ NULL,
1771                                        NULL,
1772                                        NULL,
1773                                        NULL,
1774                                        NULL,
1775                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1776                                        NULL,
1777                                        NULL,
1778                                        NULL,
1779                                        NULL,
1780                                /* 59*/ NULL,
1781                                        MatDestroy_SeqSELL,
1782                                        MatView_SeqSELL,
1783                                        NULL,
1784                                        NULL,
1785                                /* 64*/ NULL,
1786                                        NULL,
1787                                        NULL,
1788                                        NULL,
1789                                        NULL,
1790                                /* 69*/ NULL,
1791                                        NULL,
1792                                        NULL,
1793                                        NULL,
1794                                        NULL,
1795                                /* 74*/ NULL,
1796                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1797                                        NULL,
1798                                        NULL,
1799                                        NULL,
1800                                /* 79*/ NULL,
1801                                        NULL,
1802                                        NULL,
1803                                        NULL,
1804                                        NULL,
1805                                /* 84*/ NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        NULL,
1809                                        NULL,
1810                                /* 89*/ NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        NULL,
1814                                        NULL,
1815                                /* 94*/ NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                        NULL,
1820                                /* 99*/ NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        MatConjugate_SeqSELL,
1824                                        NULL,
1825                                /*104*/ NULL,
1826                                        NULL,
1827                                        NULL,
1828                                        NULL,
1829                                        NULL,
1830                                /*109*/ NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        NULL,
1834                                        MatMissingDiagonal_SeqSELL,
1835                                /*114*/ NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        NULL,
1839                                        NULL,
1840                                /*119*/ NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        NULL,
1844                                        NULL,
1845                                /*124*/ NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                        NULL,
1850                                /*129*/ NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        NULL,
1854                                        NULL,
1855                                /*134*/ NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                        NULL,
1860                                /*139*/ NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        MatFDColoringSetUp_SeqXAIJ,
1864                                        NULL,
1865                                /*144*/ NULL,
1866                                        NULL,
1867                                        NULL,
1868                                        NULL,
1869                                        NULL,
1870                                        NULL
1871 };
1872 
1873 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1874 {
1875   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1876 
1877   PetscFunctionBegin;
1878   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1879 
1880   /* allocate space for values if not already there */
1881   if (!a->saved_values) {
1882     PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values));
1883     PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar)));
1884   }
1885 
1886   /* copy values over */
1887   PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]));
1888   PetscFunctionReturn(0);
1889 }
1890 
1891 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1892 {
1893   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1894 
1895   PetscFunctionBegin;
1896   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1897   PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1898   PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]));
1899   PetscFunctionReturn(0);
1900 }
1901 
1902 /*@C
1903  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1904 
1905  Not Collective
1906 
1907  Input Parameters:
1908  .  mat - a MATSEQSELL matrix
1909  .  array - pointer to the data
1910 
1911  Level: intermediate
1912 
1913  .seealso: `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1914  @*/
1915 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1916 {
1917   PetscFunctionBegin;
1918   PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));
1919   PetscFunctionReturn(0);
1920 }
1921 
1922 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1923 {
1924   Mat_SeqSELL    *b;
1925   PetscMPIInt    size;
1926 
1927   PetscFunctionBegin;
1928   PetscCall(PetscCitationsRegister(citation,&cited));
1929   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
1930   PetscCheck(size <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1931 
1932   PetscCall(PetscNewLog(B,&b));
1933 
1934   B->data = (void*)b;
1935 
1936   PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps)));
1937 
1938   b->row                = NULL;
1939   b->col                = NULL;
1940   b->icol               = NULL;
1941   b->reallocs           = 0;
1942   b->ignorezeroentries  = PETSC_FALSE;
1943   b->roworiented        = PETSC_TRUE;
1944   b->nonew              = 0;
1945   b->diag               = NULL;
1946   b->solve_work         = NULL;
1947   B->spptr              = NULL;
1948   b->saved_values       = NULL;
1949   b->idiag              = NULL;
1950   b->mdiag              = NULL;
1951   b->ssor_work          = NULL;
1952   b->omega              = 1.0;
1953   b->fshift             = 0.0;
1954   b->idiagvalid         = PETSC_FALSE;
1955   b->keepnonzeropattern = PETSC_FALSE;
1956 
1957   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL));
1958   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL));
1959   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL));
1960   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL));
1961   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL));
1962   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL));
1963   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ));
1964   PetscFunctionReturn(0);
1965 }
1966 
1967 /*
1968  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1969  */
1970 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1971 {
1972   Mat_SeqSELL    *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data;
1973   PetscInt       i,m=A->rmap->n;
1974   PetscInt       totalslices=a->totalslices;
1975 
1976   PetscFunctionBegin;
1977   C->factortype = A->factortype;
1978   c->row        = NULL;
1979   c->col        = NULL;
1980   c->icol       = NULL;
1981   c->reallocs   = 0;
1982   C->assembled = PETSC_TRUE;
1983 
1984   PetscCall(PetscLayoutReference(A->rmap,&C->rmap));
1985   PetscCall(PetscLayoutReference(A->cmap,&C->cmap));
1986 
1987   PetscCall(PetscMalloc1(8*totalslices,&c->rlen));
1988   PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
1989   PetscCall(PetscMalloc1(totalslices+1,&c->sliidx));
1990   PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt)));
1991 
1992   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
1993   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
1994 
1995   /* allocate the matrix space */
1996   if (mallocmatspace) {
1997     PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx));
1998     PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt))));
1999 
2000     c->singlemalloc = PETSC_TRUE;
2001 
2002     if (m > 0) {
2003       PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat));
2004       if (cpvalues == MAT_COPY_VALUES) {
2005         PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat));
2006       } else {
2007         PetscCall(PetscArrayzero(c->val,a->maxallocmat));
2008       }
2009     }
2010   }
2011 
2012   c->ignorezeroentries = a->ignorezeroentries;
2013   c->roworiented       = a->roworiented;
2014   c->nonew             = a->nonew;
2015   if (a->diag) {
2016     PetscCall(PetscMalloc1(m,&c->diag));
2017     PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
2018     for (i=0; i<m; i++) {
2019       c->diag[i] = a->diag[i];
2020     }
2021   } else c->diag = NULL;
2022 
2023   c->solve_work         = NULL;
2024   c->saved_values       = NULL;
2025   c->idiag              = NULL;
2026   c->ssor_work          = NULL;
2027   c->keepnonzeropattern = a->keepnonzeropattern;
2028   c->free_val           = PETSC_TRUE;
2029   c->free_colidx        = PETSC_TRUE;
2030 
2031   c->maxallocmat  = a->maxallocmat;
2032   c->maxallocrow  = a->maxallocrow;
2033   c->rlenmax      = a->rlenmax;
2034   c->nz           = a->nz;
2035   C->preallocated = PETSC_TRUE;
2036 
2037   c->nonzerorowcnt = a->nonzerorowcnt;
2038   C->nonzerostate  = A->nonzerostate;
2039 
2040   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist));
2041   PetscFunctionReturn(0);
2042 }
2043 
2044 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2045 {
2046   PetscFunctionBegin;
2047   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
2048   PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n));
2049   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2050     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
2051   }
2052   PetscCall(MatSetType(*B,((PetscObject)A)->type_name));
2053   PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE));
2054   PetscFunctionReturn(0);
2055 }
2056 
2057 /*MC
2058    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2059    based on the sliced Ellpack format
2060 
2061    Options Database Keys:
2062 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()
2063 
2064    Level: beginner
2065 
2066 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2067 M*/
2068 
2069 /*MC
2070    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2071 
2072    This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
2073    and MATMPISELL otherwise.  As a result, for single process communicators,
2074   MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
2075   for communicators controlling multiple processes.  It is recommended that you call both of
2076   the above preallocation routines for simplicity.
2077 
2078    Options Database Keys:
2079 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2080 
2081   Level: beginner
2082 
2083   Notes:
2084    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2085 
2086    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2087    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2088 
2089   Developer Notes:
2090    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2091 
2092    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2093 .vb
2094                             (2 0  3 4)
2095    Consider the matrix A =  (5 0  6 0)
2096                             (0 0  7 8)
2097                             (0 0  9 9)
2098 
2099    symbolically the Ellpack format can be written as
2100 
2101         (2 3 4 |)           (0 2 3 |)
2102    v =  (5 6 0 |)  colidx = (0 2 2 |)
2103         --------            ---------
2104         (7 8 |)             (2 3 |)
2105         (9 9 |)             (2 3 |)
2106 
2107     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2108     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2109     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2110 
2111     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2112 
2113 .ve
2114 
2115       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2116 
2117  References:
2118 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2119    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2120 
2121 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2122 M*/
2123 
2124 /*@C
2125        MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2126 
2127  Collective on comm
2128 
2129  Input Parameters:
2130 +  comm - MPI communicator, set to PETSC_COMM_SELF
2131 .  m - number of rows
2132 .  n - number of columns
2133 .  rlenmax - maximum number of nonzeros in a row
2134 -  rlen - array containing the number of nonzeros in the various rows
2135  (possibly different for each row) or NULL
2136 
2137  Output Parameter:
2138 .  A - the matrix
2139 
2140  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2141  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2142  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2143 
2144  Notes:
2145  If nnz is given then nz is ignored
2146 
2147  Specify the preallocated storage with either rlenmax or rlen (not both).
2148  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2149  allocation.  For large problems you MUST preallocate memory or you
2150  will get TERRIBLE performance, see the users' manual chapter on matrices.
2151 
2152  Level: intermediate
2153 
2154  .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2155 
2156  @*/
2157 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2158 {
2159   PetscFunctionBegin;
2160   PetscCall(MatCreate(comm,A));
2161   PetscCall(MatSetSizes(*A,m,n,m,n));
2162   PetscCall(MatSetType(*A,MATSEQSELL));
2163   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen));
2164   PetscFunctionReturn(0);
2165 }
2166 
2167 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2168 {
2169   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2170   PetscInt       totalslices=a->totalslices;
2171 
2172   PetscFunctionBegin;
2173   /* If the  matrix dimensions are not equal,or no of nonzeros */
2174   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2175     *flg = PETSC_FALSE;
2176     PetscFunctionReturn(0);
2177   }
2178   /* if the a->colidx are the same */
2179   PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg));
2180   if (!*flg) PetscFunctionReturn(0);
2181   /* if a->val are the same */
2182   PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg));
2183   PetscFunctionReturn(0);
2184 }
2185 
2186 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2187 {
2188   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2189 
2190   PetscFunctionBegin;
2191   a->idiagvalid  = PETSC_FALSE;
2192   PetscFunctionReturn(0);
2193 }
2194 
2195 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2196 {
2197 #if defined(PETSC_USE_COMPLEX)
2198   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2199   PetscInt    i;
2200   PetscScalar *val = a->val;
2201 
2202   PetscFunctionBegin;
2203   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2204     val[i] = PetscConj(val[i]);
2205   }
2206 #else
2207   PetscFunctionBegin;
2208 #endif
2209   PetscFunctionReturn(0);
2210 }
2211