xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 40cbb1a031ea8f2be4fe2b92dc842b003ad37be3)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited = PETSC_FALSE;
10 static const char citation[] =
11   "@inproceedings{ZhangELLPACK2018,\n"
12   " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
13   " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
14   " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
15   " year = 2018\n"
16 "}\n";
17 
18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
19 
20   #include <immintrin.h>
21 
22   #if !defined(_MM_SCALE_8)
23   #define _MM_SCALE_8    8
24   #endif
25 
26   #if defined(__AVX512F__)
27   /* these do not work
28    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
29    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
30   */
31     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
32     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
33     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
34     vec_vals = _mm512_loadu_pd(aval); \
35     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
36     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
37   #elif defined(__AVX2__) && defined(__FMA__)
38     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
39     vec_vals = _mm256_loadu_pd(aval); \
40     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
41     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
42     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
43   #endif
44 #endif  /* PETSC_HAVE_IMMINTRIN_H */
45 
46 /*@C
47  MatSeqSELLSetPreallocation - For good matrix assembly performance
48  the user should preallocate the matrix storage by setting the parameter nz
49  (or the array nnz).  By setting these parameters accurately, performance
50  during matrix assembly can be increased significantly.
51 
52  Collective
53 
54  Input Parameters:
55  +  B - The matrix
56  .  nz - number of nonzeros per row (same for all rows)
57  -  nnz - array containing the number of nonzeros in the various rows
58  (possibly different for each row) or NULL
59 
60  Notes:
61  If nnz is given then nz is ignored.
62 
63  Specify the preallocated storage with either nz or nnz (not both).
64  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
65  allocation.  For large problems you MUST preallocate memory or you
66  will get TERRIBLE performance, see the users' manual chapter on matrices.
67 
68  You can call MatGetInfo() to get information on how effective the preallocation was;
69  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
70  You can also run with the option -info and look for messages with the string
71  malloc in them to see if additional memory allocation was needed.
72 
73  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
74  entries or columns indices.
75 
76  The maximum number of nonzeos in any row should be as accurate as possible.
77  If it is underestimated, you will get bad performance due to reallocation
78  (MatSeqXSELLReallocateSELL).
79 
80  Level: intermediate
81 
82  .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()`
83 
84  @*/
85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
86 {
87   PetscFunctionBegin;
88   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
89   PetscValidType(B,1);
90   PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));
91   PetscFunctionReturn(0);
92 }
93 
94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
95 {
96   Mat_SeqSELL    *b;
97   PetscInt       i,j,totalslices;
98   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
99 
100   PetscFunctionBegin;
101   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
102   if (maxallocrow == MAT_SKIP_ALLOCATION) {
103     skipallocation = PETSC_TRUE;
104     maxallocrow    = 0;
105   }
106 
107   PetscCall(PetscLayoutSetUp(B->rmap));
108   PetscCall(PetscLayoutSetUp(B->cmap));
109 
110   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
111   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
112   PetscCheck(maxallocrow >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow);
113   if (rlen) {
114     for (i=0; i<B->rmap->n; i++) {
115       PetscCheck(rlen[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]);
116       PetscCheck(rlen[i] <= B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n);
117     }
118   }
119 
120   B->preallocated = PETSC_TRUE;
121 
122   b = (Mat_SeqSELL*)B->data;
123 
124   totalslices = PetscCeilInt(B->rmap->n,8);
125   b->totalslices = totalslices;
126   if (!skipallocation) {
127     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n));
128 
129     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
130       PetscCall(PetscMalloc1(totalslices+1,&b->sliidx));
131       PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt)));
132     }
133     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
134       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
135       else if (maxallocrow < 0) maxallocrow = 1;
136       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
137     } else {
138       maxallocrow = 0;
139       b->sliidx[0] = 0;
140       for (i=1; i<totalslices; i++) {
141         b->sliidx[i] = 0;
142         for (j=0;j<8;j++) {
143           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
144         }
145         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
146         PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]));
147       }
148       /* last slice */
149       b->sliidx[totalslices] = 0;
150       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
151       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
152       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
153     }
154 
155     /* allocate space for val, colidx, rlen */
156     /* FIXME: should B's old memory be unlogged? */
157     PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx));
158     /* FIXME: assuming an element of the bit array takes 8 bits */
159     PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx));
160     PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt))));
161     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
162     PetscCall(PetscCalloc1(8*totalslices,&b->rlen));
163     PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt)));
164 
165     b->singlemalloc = PETSC_TRUE;
166     b->free_val     = PETSC_TRUE;
167     b->free_colidx  = PETSC_TRUE;
168   } else {
169     b->free_val    = PETSC_FALSE;
170     b->free_colidx = PETSC_FALSE;
171   }
172 
173   b->nz               = 0;
174   b->maxallocrow      = maxallocrow;
175   b->rlenmax          = maxallocrow;
176   b->maxallocmat      = b->sliidx[totalslices];
177   B->info.nz_unneeded = (double)b->maxallocmat;
178   if (realalloc) {
179     PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));
180   }
181   PetscFunctionReturn(0);
182 }
183 
184 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
185 {
186   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
187   PetscInt    shift;
188 
189   PetscFunctionBegin;
190   PetscCheck(row >= 0 && row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row);
191   if (nz) *nz = a->rlen[row];
192   shift = a->sliidx[row>>3]+(row&0x07);
193   if (!a->getrowcols) {
194 
195     PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals));
196   }
197   if (idx) {
198     PetscInt j;
199     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
200     *idx = a->getrowcols;
201   }
202   if (v) {
203     PetscInt j;
204     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
205     *v = a->getrowvals;
206   }
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
211 {
212   PetscFunctionBegin;
213   PetscFunctionReturn(0);
214 }
215 
216 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
217 {
218   Mat            B;
219   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
220   PetscInt       i;
221 
222   PetscFunctionBegin;
223   if (reuse == MAT_REUSE_MATRIX) {
224     B    = *newmat;
225     PetscCall(MatZeroEntries(B));
226   } else {
227     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
228     PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N));
229     PetscCall(MatSetType(B,MATSEQAIJ));
230     PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen));
231   }
232 
233   for (i=0; i<A->rmap->n; i++) {
234     PetscInt    nz = 0,*cols = NULL;
235     PetscScalar *vals = NULL;
236 
237     PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals));
238     PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES));
239     PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals));
240   }
241 
242   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
243   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
244   B->rmap->bs = A->rmap->bs;
245 
246   if (reuse == MAT_INPLACE_MATRIX) {
247     PetscCall(MatHeaderReplace(A,&B));
248   } else {
249     *newmat = B;
250   }
251   PetscFunctionReturn(0);
252 }
253 
254 #include <../src/mat/impls/aij/seq/aij.h>
255 
256 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
257 {
258   Mat               B;
259   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
260   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
261   const PetscInt    *cols;
262   const PetscScalar *vals;
263 
264   PetscFunctionBegin;
265 
266   if (reuse == MAT_REUSE_MATRIX) {
267     B = *newmat;
268   } else {
269     if (PetscDefined(USE_DEBUG) || !a->ilen) {
270       PetscCall(PetscMalloc1(m,&rowlengths));
271       for (i=0; i<m; i++) {
272         rowlengths[i] = ai[i+1] - ai[i];
273       }
274     }
275     if (PetscDefined(USE_DEBUG) && a->ilen) {
276       PetscBool eq;
277       PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq));
278       PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect");
279       PetscCall(PetscFree(rowlengths));
280       rowlengths = a->ilen;
281     } else if (a->ilen) rowlengths = a->ilen;
282     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
283     PetscCall(MatSetSizes(B,m,n,m,n));
284     PetscCall(MatSetType(B,MATSEQSELL));
285     PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths));
286     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
287   }
288 
289   for (row=0; row<m; row++) {
290     PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
291     PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES));
292     PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
293   }
294   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
295   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
296   B->rmap->bs = A->rmap->bs;
297 
298   if (reuse == MAT_INPLACE_MATRIX) {
299     PetscCall(MatHeaderReplace(A,&B));
300   } else {
301     *newmat = B;
302   }
303   PetscFunctionReturn(0);
304 }
305 
306 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
307 {
308   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
309   PetscScalar       *y;
310   const PetscScalar *x;
311   const MatScalar   *aval=a->val;
312   PetscInt          totalslices=a->totalslices;
313   const PetscInt    *acolidx=a->colidx;
314   PetscInt          i,j;
315 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
316   __m512d           vec_x,vec_y,vec_vals;
317   __m256i           vec_idx;
318   __mmask8          mask;
319   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
320   __m256i           vec_idx2,vec_idx3,vec_idx4;
321 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
322   __m128i           vec_idx;
323   __m256d           vec_x,vec_y,vec_y2,vec_vals;
324   MatScalar         yval;
325   PetscInt          r,rows_left,row,nnz_in_row;
326 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
327   __m128d           vec_x_tmp;
328   __m256d           vec_x,vec_y,vec_y2,vec_vals;
329   MatScalar         yval;
330   PetscInt          r,rows_left,row,nnz_in_row;
331 #else
332   PetscScalar       sum[8];
333 #endif
334 
335 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
336 #pragma disjoint(*x,*y,*aval)
337 #endif
338 
339   PetscFunctionBegin;
340   PetscCall(VecGetArrayRead(xx,&x));
341   PetscCall(VecGetArray(yy,&y));
342 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
343   for (i=0; i<totalslices; i++) { /* loop over slices */
344     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
345     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
346 
347     vec_y  = _mm512_setzero_pd();
348     vec_y2 = _mm512_setzero_pd();
349     vec_y3 = _mm512_setzero_pd();
350     vec_y4 = _mm512_setzero_pd();
351 
352     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
353     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
354     case 3:
355       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
356       acolidx += 8; aval += 8;
357       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
358       acolidx += 8; aval += 8;
359       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
360       acolidx += 8; aval += 8;
361       j += 3;
362       break;
363     case 2:
364       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
365       acolidx += 8; aval += 8;
366       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
367       acolidx += 8; aval += 8;
368       j += 2;
369       break;
370     case 1:
371       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
372       acolidx += 8; aval += 8;
373       j += 1;
374       break;
375     }
376     #pragma novector
377     for (; j<(a->sliidx[i+1]>>3); j+=4) {
378       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
379       acolidx += 8; aval += 8;
380       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
381       acolidx += 8; aval += 8;
382       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
383       acolidx += 8; aval += 8;
384       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
385       acolidx += 8; aval += 8;
386     }
387 
388     vec_y = _mm512_add_pd(vec_y,vec_y2);
389     vec_y = _mm512_add_pd(vec_y,vec_y3);
390     vec_y = _mm512_add_pd(vec_y,vec_y4);
391     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
392       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
393       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
394     } else {
395       _mm512_storeu_pd(&y[8*i],vec_y);
396     }
397   }
398 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
399   for (i=0; i<totalslices; i++) { /* loop over full slices */
400     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
401     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
402 
403     /* last slice may have padding rows. Don't use vectorization. */
404     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
405       rows_left = A->rmap->n - 8*i;
406       for (r=0; r<rows_left; ++r) {
407         yval = (MatScalar)0;
408         row = 8*i + r;
409         nnz_in_row = a->rlen[row];
410         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
411         y[row] = yval;
412       }
413       break;
414     }
415 
416     vec_y  = _mm256_setzero_pd();
417     vec_y2 = _mm256_setzero_pd();
418 
419     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
420     #pragma novector
421     #pragma unroll(2)
422     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
423       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
424       aval += 4; acolidx += 4;
425       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
426       aval += 4; acolidx += 4;
427     }
428 
429     _mm256_storeu_pd(y+i*8,vec_y);
430     _mm256_storeu_pd(y+i*8+4,vec_y2);
431   }
432 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
433   for (i=0; i<totalslices; i++) { /* loop over full slices */
434     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
435     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
436 
437     vec_y  = _mm256_setzero_pd();
438     vec_y2 = _mm256_setzero_pd();
439 
440     /* last slice may have padding rows. Don't use vectorization. */
441     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
442       rows_left = A->rmap->n - 8*i;
443       for (r=0; r<rows_left; ++r) {
444         yval = (MatScalar)0;
445         row = 8*i + r;
446         nnz_in_row = a->rlen[row];
447         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
448         y[row] = yval;
449       }
450       break;
451     }
452 
453     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
454     #pragma novector
455     #pragma unroll(2)
456     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
457       vec_vals  = _mm256_loadu_pd(aval);
458       vec_x_tmp = _mm_setzero_pd();
459       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
460       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
461       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
462       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
463       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
464       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
465       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
466       aval     += 4;
467 
468       vec_vals  = _mm256_loadu_pd(aval);
469       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
470       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
471       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
472       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
473       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
474       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
475       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
476       aval     += 4;
477     }
478 
479     _mm256_storeu_pd(y + i*8,     vec_y);
480     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
481   }
482 #else
483   for (i=0; i<totalslices; i++) { /* loop over slices */
484     for (j=0; j<8; j++) sum[j] = 0.0;
485     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
486       sum[0] += aval[j] * x[acolidx[j]];
487       sum[1] += aval[j+1] * x[acolidx[j+1]];
488       sum[2] += aval[j+2] * x[acolidx[j+2]];
489       sum[3] += aval[j+3] * x[acolidx[j+3]];
490       sum[4] += aval[j+4] * x[acolidx[j+4]];
491       sum[5] += aval[j+5] * x[acolidx[j+5]];
492       sum[6] += aval[j+6] * x[acolidx[j+6]];
493       sum[7] += aval[j+7] * x[acolidx[j+7]];
494     }
495     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
496       for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
497     } else {
498       for (j=0; j<8; j++) y[8*i+j] = sum[j];
499     }
500   }
501 #endif
502 
503   PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */
504   PetscCall(VecRestoreArrayRead(xx,&x));
505   PetscCall(VecRestoreArray(yy,&y));
506   PetscFunctionReturn(0);
507 }
508 
509 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
510 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
511 {
512   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
513   PetscScalar       *y,*z;
514   const PetscScalar *x;
515   const MatScalar   *aval=a->val;
516   PetscInt          totalslices=a->totalslices;
517   const PetscInt    *acolidx=a->colidx;
518   PetscInt          i,j;
519 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
520   __m512d           vec_x,vec_y,vec_vals;
521   __m256i           vec_idx;
522   __mmask8          mask;
523   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
524   __m256i           vec_idx2,vec_idx3,vec_idx4;
525 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
526   __m128d           vec_x_tmp;
527   __m256d           vec_x,vec_y,vec_y2,vec_vals;
528   MatScalar         yval;
529   PetscInt          r,row,nnz_in_row;
530 #else
531   PetscScalar       sum[8];
532 #endif
533 
534 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
535 #pragma disjoint(*x,*y,*aval)
536 #endif
537 
538   PetscFunctionBegin;
539   PetscCall(VecGetArrayRead(xx,&x));
540   PetscCall(VecGetArrayPair(yy,zz,&y,&z));
541 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
542   for (i=0; i<totalslices; i++) { /* loop over slices */
543     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
544     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
545 
546     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
547       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
548       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
549     } else {
550       vec_y  = _mm512_loadu_pd(&y[8*i]);
551     }
552     vec_y2 = _mm512_setzero_pd();
553     vec_y3 = _mm512_setzero_pd();
554     vec_y4 = _mm512_setzero_pd();
555 
556     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
557     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
558     case 3:
559       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
560       acolidx += 8; aval += 8;
561       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
562       acolidx += 8; aval += 8;
563       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
564       acolidx += 8; aval += 8;
565       j += 3;
566       break;
567     case 2:
568       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
569       acolidx += 8; aval += 8;
570       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
571       acolidx += 8; aval += 8;
572       j += 2;
573       break;
574     case 1:
575       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
576       acolidx += 8; aval += 8;
577       j += 1;
578       break;
579     }
580     #pragma novector
581     for (; j<(a->sliidx[i+1]>>3); j+=4) {
582       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
583       acolidx += 8; aval += 8;
584       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
585       acolidx += 8; aval += 8;
586       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
587       acolidx += 8; aval += 8;
588       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
589       acolidx += 8; aval += 8;
590     }
591 
592     vec_y = _mm512_add_pd(vec_y,vec_y2);
593     vec_y = _mm512_add_pd(vec_y,vec_y3);
594     vec_y = _mm512_add_pd(vec_y,vec_y4);
595     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
596       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
597     } else {
598       _mm512_storeu_pd(&z[8*i],vec_y);
599     }
600   }
601 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
602   for (i=0; i<totalslices; i++) { /* loop over full slices */
603     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
604     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
605 
606     /* last slice may have padding rows. Don't use vectorization. */
607     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
608       for (r=0; r<(A->rmap->n & 0x07); ++r) {
609         row        = 8*i + r;
610         yval       = (MatScalar)0.0;
611         nnz_in_row = a->rlen[row];
612         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
613         z[row] = y[row] + yval;
614       }
615       break;
616     }
617 
618     vec_y  = _mm256_loadu_pd(y+8*i);
619     vec_y2 = _mm256_loadu_pd(y+8*i+4);
620 
621     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
622     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
623       vec_vals  = _mm256_loadu_pd(aval);
624       vec_x_tmp = _mm_setzero_pd();
625       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
626       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
627       vec_x     = _mm256_setzero_pd();
628       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
629       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
630       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
631       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
632       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
633       aval     += 4;
634 
635       vec_vals  = _mm256_loadu_pd(aval);
636       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
637       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
638       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
639       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
640       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
641       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
642       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
643       aval     += 4;
644     }
645 
646     _mm256_storeu_pd(z+i*8,vec_y);
647     _mm256_storeu_pd(z+i*8+4,vec_y2);
648   }
649 #else
650   for (i=0; i<totalslices; i++) { /* loop over slices */
651     for (j=0; j<8; j++) sum[j] = 0.0;
652     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
653       sum[0] += aval[j] * x[acolidx[j]];
654       sum[1] += aval[j+1] * x[acolidx[j+1]];
655       sum[2] += aval[j+2] * x[acolidx[j+2]];
656       sum[3] += aval[j+3] * x[acolidx[j+3]];
657       sum[4] += aval[j+4] * x[acolidx[j+4]];
658       sum[5] += aval[j+5] * x[acolidx[j+5]];
659       sum[6] += aval[j+6] * x[acolidx[j+6]];
660       sum[7] += aval[j+7] * x[acolidx[j+7]];
661     }
662     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
663       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
664     } else {
665       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
666     }
667   }
668 #endif
669 
670   PetscCall(PetscLogFlops(2.0*a->nz));
671   PetscCall(VecRestoreArrayRead(xx,&x));
672   PetscCall(VecRestoreArrayPair(yy,zz,&y,&z));
673   PetscFunctionReturn(0);
674 }
675 
676 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
677 {
678   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
679   PetscScalar       *y;
680   const PetscScalar *x;
681   const MatScalar   *aval=a->val;
682   const PetscInt    *acolidx=a->colidx;
683   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
684 
685 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
686 #pragma disjoint(*x,*y,*aval)
687 #endif
688 
689   PetscFunctionBegin;
690   if (A->symmetric) {
691     PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy));
692     PetscFunctionReturn(0);
693   }
694   if (zz != yy) PetscCall(VecCopy(zz,yy));
695   PetscCall(VecGetArrayRead(xx,&x));
696   PetscCall(VecGetArray(yy,&y));
697   for (i=0; i<a->totalslices; i++) { /* loop over slices */
698     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
699       for (r=0; r<(A->rmap->n & 0x07); ++r) {
700         row        = 8*i + r;
701         nnz_in_row = a->rlen[row];
702         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
703       }
704       break;
705     }
706     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
707       y[acolidx[j]]   += aval[j] * x[8*i];
708       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
709       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
710       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
711       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
712       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
713       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
714       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
715     }
716   }
717   PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices]));
718   PetscCall(VecRestoreArrayRead(xx,&x));
719   PetscCall(VecRestoreArray(yy,&y));
720   PetscFunctionReturn(0);
721 }
722 
723 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
724 {
725   PetscFunctionBegin;
726   if (A->symmetric) {
727     PetscCall(MatMult_SeqSELL(A,xx,yy));
728   } else {
729     PetscCall(VecSet(yy,0.0));
730     PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy));
731   }
732   PetscFunctionReturn(0);
733 }
734 
735 /*
736      Checks for missing diagonals
737 */
738 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
739 {
740   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
741   PetscInt       *diag,i;
742 
743   PetscFunctionBegin;
744   *missing = PETSC_FALSE;
745   if (A->rmap->n > 0 && !(a->colidx)) {
746     *missing = PETSC_TRUE;
747     if (d) *d = 0;
748     PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n"));
749   } else {
750     diag = a->diag;
751     for (i=0; i<A->rmap->n; i++) {
752       if (diag[i] == -1) {
753         *missing = PETSC_TRUE;
754         if (d) *d = i;
755         PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i));
756         break;
757       }
758     }
759   }
760   PetscFunctionReturn(0);
761 }
762 
763 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
764 {
765   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
766   PetscInt       i,j,m=A->rmap->n,shift;
767 
768   PetscFunctionBegin;
769   if (!a->diag) {
770     PetscCall(PetscMalloc1(m,&a->diag));
771     PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt)));
772     a->free_diag = PETSC_TRUE;
773   }
774   for (i=0; i<m; i++) { /* loop over rows */
775     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
776     a->diag[i] = -1;
777     for (j=0; j<a->rlen[i]; j++) {
778       if (a->colidx[shift+j*8] == i) {
779         a->diag[i] = shift+j*8;
780         break;
781       }
782     }
783   }
784   PetscFunctionReturn(0);
785 }
786 
787 /*
788   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
789 */
790 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
791 {
792   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
793   PetscInt       i,*diag,m = A->rmap->n;
794   MatScalar      *val = a->val;
795   PetscScalar    *idiag,*mdiag;
796 
797   PetscFunctionBegin;
798   if (a->idiagvalid) PetscFunctionReturn(0);
799   PetscCall(MatMarkDiagonal_SeqSELL(A));
800   diag = a->diag;
801   if (!a->idiag) {
802     PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work));
803     PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar)));
804     val  = a->val;
805   }
806   mdiag = a->mdiag;
807   idiag = a->idiag;
808 
809   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
810     for (i=0; i<m; i++) {
811       mdiag[i] = val[diag[i]];
812       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
813         if (PetscRealPart(fshift)) {
814           PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i));
815           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
816           A->factorerror_zeropivot_value = 0.0;
817           A->factorerror_zeropivot_row   = i;
818         } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i);
819       }
820       idiag[i] = 1.0/val[diag[i]];
821     }
822     PetscCall(PetscLogFlops(m));
823   } else {
824     for (i=0; i<m; i++) {
825       mdiag[i] = val[diag[i]];
826       idiag[i] = omega/(fshift + val[diag[i]]);
827     }
828     PetscCall(PetscLogFlops(2.0*m));
829   }
830   a->idiagvalid = PETSC_TRUE;
831   PetscFunctionReturn(0);
832 }
833 
834 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
835 {
836   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
837 
838   PetscFunctionBegin;
839   PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices]));
840   PetscCall(MatSeqSELLInvalidateDiagonal(A));
841   PetscFunctionReturn(0);
842 }
843 
844 PetscErrorCode MatDestroy_SeqSELL(Mat A)
845 {
846   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
847 
848   PetscFunctionBegin;
849 #if defined(PETSC_USE_LOG)
850   PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz);
851 #endif
852   PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx));
853   PetscCall(ISDestroy(&a->row));
854   PetscCall(ISDestroy(&a->col));
855   PetscCall(PetscFree(a->diag));
856   PetscCall(PetscFree(a->rlen));
857   PetscCall(PetscFree(a->sliidx));
858   PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work));
859   PetscCall(PetscFree(a->solve_work));
860   PetscCall(ISDestroy(&a->icol));
861   PetscCall(PetscFree(a->saved_values));
862   PetscCall(PetscFree2(a->getrowcols,a->getrowvals));
863 
864   PetscCall(PetscFree(A->data));
865 
866   PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
867   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL));
868   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL));
869   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL));
870   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLGetArray_C",NULL));
871   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLRestoreArray_C",NULL));
872   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqsell_seqaij_C",NULL));
873   PetscFunctionReturn(0);
874 }
875 
876 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
877 {
878   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
879 
880   PetscFunctionBegin;
881   switch (op) {
882   case MAT_ROW_ORIENTED:
883     a->roworiented = flg;
884     break;
885   case MAT_KEEP_NONZERO_PATTERN:
886     a->keepnonzeropattern = flg;
887     break;
888   case MAT_NEW_NONZERO_LOCATIONS:
889     a->nonew = (flg ? 0 : 1);
890     break;
891   case MAT_NEW_NONZERO_LOCATION_ERR:
892     a->nonew = (flg ? -1 : 0);
893     break;
894   case MAT_NEW_NONZERO_ALLOCATION_ERR:
895     a->nonew = (flg ? -2 : 0);
896     break;
897   case MAT_UNUSED_NONZERO_LOCATION_ERR:
898     a->nounused = (flg ? -1 : 0);
899     break;
900   case MAT_FORCE_DIAGONAL_ENTRIES:
901   case MAT_IGNORE_OFF_PROC_ENTRIES:
902   case MAT_USE_HASH_TABLE:
903   case MAT_SORTED_FULL:
904     PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op]));
905     break;
906   case MAT_SPD:
907   case MAT_SYMMETRIC:
908   case MAT_STRUCTURALLY_SYMMETRIC:
909   case MAT_HERMITIAN:
910   case MAT_SYMMETRY_ETERNAL:
911     /* These options are handled directly by MatSetOption() */
912     break;
913   default:
914     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
915   }
916   PetscFunctionReturn(0);
917 }
918 
919 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
920 {
921   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
922   PetscInt       i,j,n,shift;
923   PetscScalar    *x,zero=0.0;
924 
925   PetscFunctionBegin;
926   PetscCall(VecGetLocalSize(v,&n));
927   PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
928 
929   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
930     PetscInt *diag=a->diag;
931     PetscCall(VecGetArray(v,&x));
932     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
933     PetscCall(VecRestoreArray(v,&x));
934     PetscFunctionReturn(0);
935   }
936 
937   PetscCall(VecSet(v,zero));
938   PetscCall(VecGetArray(v,&x));
939   for (i=0; i<n; i++) { /* loop over rows */
940     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
941     x[i] = 0;
942     for (j=0; j<a->rlen[i]; j++) {
943       if (a->colidx[shift+j*8] == i) {
944         x[i] = a->val[shift+j*8];
945         break;
946       }
947     }
948   }
949   PetscCall(VecRestoreArray(v,&x));
950   PetscFunctionReturn(0);
951 }
952 
953 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
954 {
955   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
956   const PetscScalar *l,*r;
957   PetscInt          i,j,m,n,row;
958 
959   PetscFunctionBegin;
960   if (ll) {
961     /* The local size is used so that VecMPI can be passed to this routine
962        by MatDiagonalScale_MPISELL */
963     PetscCall(VecGetLocalSize(ll,&m));
964     PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
965     PetscCall(VecGetArrayRead(ll,&l));
966     for (i=0; i<a->totalslices; i++) { /* loop over slices */
967       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
968         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
969           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row];
970         }
971       } else {
972         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
973           a->val[j] *= l[8*i+row];
974         }
975       }
976     }
977     PetscCall(VecRestoreArrayRead(ll,&l));
978     PetscCall(PetscLogFlops(a->nz));
979   }
980   if (rr) {
981     PetscCall(VecGetLocalSize(rr,&n));
982     PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
983     PetscCall(VecGetArrayRead(rr,&r));
984     for (i=0; i<a->totalslices; i++) { /* loop over slices */
985       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
986         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
987           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
988         }
989       } else {
990         for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
991           a->val[j] *= r[a->colidx[j]];
992         }
993       }
994     }
995     PetscCall(VecRestoreArrayRead(rr,&r));
996     PetscCall(PetscLogFlops(a->nz));
997   }
998   PetscCall(MatSeqSELLInvalidateDiagonal(A));
999   PetscFunctionReturn(0);
1000 }
1001 
1002 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
1003 {
1004   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1005   PetscInt    *cp,i,k,low,high,t,row,col,l;
1006   PetscInt    shift;
1007   MatScalar   *vp;
1008 
1009   PetscFunctionBegin;
1010   for (k=0; k<m; k++) { /* loop over requested rows */
1011     row = im[k];
1012     if (row<0) continue;
1013     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1014     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1015     cp = a->colidx+shift; /* pointer to the row */
1016     vp = a->val+shift; /* pointer to the row */
1017     for (l=0; l<n; l++) { /* loop over requested columns */
1018       col = in[l];
1019       if (col<0) continue;
1020       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1021       high = a->rlen[row]; low = 0; /* assume unsorted */
1022       while (high-low > 5) {
1023         t = (low+high)/2;
1024         if (*(cp+t*8) > col) high = t;
1025         else low = t;
1026       }
1027       for (i=low; i<high; i++) {
1028         if (*(cp+8*i) > col) break;
1029         if (*(cp+8*i) == col) {
1030           *v++ = *(vp+8*i);
1031           goto finished;
1032         }
1033       }
1034       *v++ = 0.0;
1035     finished:;
1036     }
1037   }
1038   PetscFunctionReturn(0);
1039 }
1040 
1041 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1042 {
1043   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1044   PetscInt          i,j,m=A->rmap->n,shift;
1045   const char        *name;
1046   PetscViewerFormat format;
1047 
1048   PetscFunctionBegin;
1049   PetscCall(PetscViewerGetFormat(viewer,&format));
1050   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1051     PetscInt nofinalvalue = 0;
1052     /*
1053     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1054       nofinalvalue = 1;
1055     }
1056     */
1057     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1058     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n));
1059     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz));
1060 #if defined(PETSC_USE_COMPLEX)
1061     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue));
1062 #else
1063     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue));
1064 #endif
1065     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n"));
1066 
1067     for (i=0; i<m; i++) {
1068       shift = a->sliidx[i>>3]+(i&0x07);
1069       for (j=0; j<a->rlen[i]; j++) {
1070 #if defined(PETSC_USE_COMPLEX)
1071         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1072 #else
1073         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]));
1074 #endif
1075       }
1076     }
1077     /*
1078     if (nofinalvalue) {
1079 #if defined(PETSC_USE_COMPLEX)
1080       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1081 #else
1082       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1083 #endif
1084     }
1085     */
1086     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1087     PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name));
1088     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1089   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1090     PetscFunctionReturn(0);
1091   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1092     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1093     for (i=0; i<m; i++) {
1094       PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1095       shift = a->sliidx[i>>3]+(i&0x07);
1096       for (j=0; j<a->rlen[i]; j++) {
1097 #if defined(PETSC_USE_COMPLEX)
1098         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1099           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1100         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1101           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1102         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1103           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1104         }
1105 #else
1106         if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1107 #endif
1108       }
1109       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1110     }
1111     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1112   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1113     PetscInt    cnt=0,jcnt;
1114     PetscScalar value;
1115 #if defined(PETSC_USE_COMPLEX)
1116     PetscBool   realonly=PETSC_TRUE;
1117     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1118       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1119         realonly = PETSC_FALSE;
1120         break;
1121       }
1122     }
1123 #endif
1124 
1125     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1126     for (i=0; i<m; i++) {
1127       jcnt = 0;
1128       shift = a->sliidx[i>>3]+(i&0x07);
1129       for (j=0; j<A->cmap->n; j++) {
1130         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1131           value = a->val[cnt++];
1132           jcnt++;
1133         } else {
1134           value = 0.0;
1135         }
1136 #if defined(PETSC_USE_COMPLEX)
1137         if (realonly) {
1138           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value)));
1139         } else {
1140           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value)));
1141         }
1142 #else
1143         PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value));
1144 #endif
1145       }
1146       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1147     }
1148     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1149   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1150     PetscInt fshift=1;
1151     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1152 #if defined(PETSC_USE_COMPLEX)
1153     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n"));
1154 #else
1155     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n"));
1156 #endif
1157     PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1158     for (i=0; i<m; i++) {
1159       shift = a->sliidx[i>>3]+(i&0x07);
1160       for (j=0; j<a->rlen[i]; j++) {
1161 #if defined(PETSC_USE_COMPLEX)
1162         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1163 #else
1164         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]));
1165 #endif
1166       }
1167     }
1168     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1169   } else if (format == PETSC_VIEWER_NATIVE) {
1170     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1171       PetscInt row;
1172       PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1]));
1173       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1174 #if defined(PETSC_USE_COMPLEX)
1175         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1176           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1177         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1178           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j])));
1179         } else {
1180           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j])));
1181         }
1182 #else
1183         PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j]));
1184 #endif
1185       }
1186     }
1187   } else {
1188     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1189     if (A->factortype) {
1190       for (i=0; i<m; i++) {
1191         shift = a->sliidx[i>>3]+(i&0x07);
1192         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1193         /* L part */
1194         for (j=shift; j<a->diag[i]; j+=8) {
1195 #if defined(PETSC_USE_COMPLEX)
1196           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1197             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1198           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1199             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1200           } else {
1201             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1202           }
1203 #else
1204           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1205 #endif
1206         }
1207         /* diagonal */
1208         j = a->diag[i];
1209 #if defined(PETSC_USE_COMPLEX)
1210         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1211           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j])));
1212         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1213           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j]))));
1214         } else {
1215           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j])));
1216         }
1217 #else
1218         PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j])));
1219 #endif
1220 
1221         /* U part */
1222         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1223 #if defined(PETSC_USE_COMPLEX)
1224           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1225             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1226           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1227             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1228           } else {
1229             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1230           }
1231 #else
1232           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1233 #endif
1234         }
1235         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1236       }
1237     } else {
1238       for (i=0; i<m; i++) {
1239         shift = a->sliidx[i>>3]+(i&0x07);
1240         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1241         for (j=0; j<a->rlen[i]; j++) {
1242 #if defined(PETSC_USE_COMPLEX)
1243           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1244             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1245           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1246             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1247           } else {
1248             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1249           }
1250 #else
1251           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1252 #endif
1253         }
1254         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1255       }
1256     }
1257     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1258   }
1259   PetscCall(PetscViewerFlush(viewer));
1260   PetscFunctionReturn(0);
1261 }
1262 
1263 #include <petscdraw.h>
1264 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1265 {
1266   Mat               A=(Mat)Aa;
1267   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1268   PetscInt          i,j,m=A->rmap->n,shift;
1269   int               color;
1270   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1271   PetscViewer       viewer;
1272   PetscViewerFormat format;
1273 
1274   PetscFunctionBegin;
1275   PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer));
1276   PetscCall(PetscViewerGetFormat(viewer,&format));
1277   PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr));
1278 
1279   /* loop over matrix elements drawing boxes */
1280 
1281   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1282     PetscDrawCollectiveBegin(draw);
1283     /* Blue for negative, Cyan for zero and  Red for positive */
1284     color = PETSC_DRAW_BLUE;
1285     for (i=0; i<m; i++) {
1286       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1287       y_l = m - i - 1.0; y_r = y_l + 1.0;
1288       for (j=0; j<a->rlen[i]; j++) {
1289         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1290         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1291         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1292       }
1293     }
1294     color = PETSC_DRAW_CYAN;
1295     for (i=0; i<m; i++) {
1296       shift = a->sliidx[i>>3]+(i&0x07);
1297       y_l = m - i - 1.0; y_r = y_l + 1.0;
1298       for (j=0; j<a->rlen[i]; j++) {
1299         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1300         if (a->val[shift+8*j] !=  0.) continue;
1301         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1302       }
1303     }
1304     color = PETSC_DRAW_RED;
1305     for (i=0; i<m; i++) {
1306       shift = a->sliidx[i>>3]+(i&0x07);
1307       y_l = m - i - 1.0; y_r = y_l + 1.0;
1308       for (j=0; j<a->rlen[i]; j++) {
1309         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1310         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1311         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1312       }
1313     }
1314     PetscDrawCollectiveEnd(draw);
1315   } else {
1316     /* use contour shading to indicate magnitude of values */
1317     /* first determine max of all nonzero values */
1318     PetscReal minv=0.0,maxv=0.0;
1319     PetscInt  count=0;
1320     PetscDraw popup;
1321     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1322       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1323     }
1324     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1325     PetscCall(PetscDrawGetPopup(draw,&popup));
1326     PetscCall(PetscDrawScalePopup(popup,minv,maxv));
1327 
1328     PetscDrawCollectiveBegin(draw);
1329     for (i=0; i<m; i++) {
1330       shift = a->sliidx[i>>3]+(i&0x07);
1331       y_l = m - i - 1.0;
1332       y_r = y_l + 1.0;
1333       for (j=0; j<a->rlen[i]; j++) {
1334         x_l = a->colidx[shift+j*8];
1335         x_r = x_l + 1.0;
1336         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1337         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1338         count++;
1339       }
1340     }
1341     PetscDrawCollectiveEnd(draw);
1342   }
1343   PetscFunctionReturn(0);
1344 }
1345 
1346 #include <petscdraw.h>
1347 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1348 {
1349   PetscDraw      draw;
1350   PetscReal      xr,yr,xl,yl,h,w;
1351   PetscBool      isnull;
1352 
1353   PetscFunctionBegin;
1354   PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw));
1355   PetscCall(PetscDrawIsNull(draw,&isnull));
1356   if (isnull) PetscFunctionReturn(0);
1357 
1358   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1359   xr  += w;          yr += h;         xl = -w;     yl = -h;
1360   PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr));
1361   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer));
1362   PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A));
1363   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL));
1364   PetscCall(PetscDrawSave(draw));
1365   PetscFunctionReturn(0);
1366 }
1367 
1368 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1369 {
1370   PetscBool      iascii,isbinary,isdraw;
1371 
1372   PetscFunctionBegin;
1373   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
1374   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary));
1375   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw));
1376   if (iascii) {
1377     PetscCall(MatView_SeqSELL_ASCII(A,viewer));
1378   } else if (isbinary) {
1379     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1380   } else if (isdraw) {
1381     PetscCall(MatView_SeqSELL_Draw(A,viewer));
1382   }
1383   PetscFunctionReturn(0);
1384 }
1385 
1386 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1387 {
1388   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1389   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1390   MatScalar      *vp;
1391 
1392   PetscFunctionBegin;
1393   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1394   /* To do: compress out the unused elements */
1395   PetscCall(MatMarkDiagonal_SeqSELL(A));
1396   PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz));
1397   PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs));
1398   PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax));
1399   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1400   for (i=0; i<a->totalslices; ++i) {
1401     shift = a->sliidx[i];    /* starting index of the slice */
1402     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1403     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1404     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1405       row  = 8*i + row_in_slice;
1406       nrow = a->rlen[row]; /* number of nonzeros in row */
1407       /*
1408         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1409         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1410       */
1411       lastcol = 0;
1412       if (nrow>0) { /* nonempty row */
1413         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1414       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1415         for (j=1;j<8;j++) {
1416           if (a->rlen[8*i+j]) {
1417             lastcol = cp[j];
1418             break;
1419           }
1420         }
1421       } else {
1422         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1423       }
1424 
1425       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1426         cp[8*k+row_in_slice] = lastcol;
1427         vp[8*k+row_in_slice] = (MatScalar)0;
1428       }
1429     }
1430   }
1431 
1432   A->info.mallocs += a->reallocs;
1433   a->reallocs      = 0;
1434 
1435   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1436   PetscFunctionReturn(0);
1437 }
1438 
1439 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1440 {
1441   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1442 
1443   PetscFunctionBegin;
1444   info->block_size   = 1.0;
1445   info->nz_allocated = a->maxallocmat;
1446   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1447   info->nz_unneeded  = (a->maxallocmat-a->sliidx[a->totalslices]);
1448   info->assemblies   = A->num_ass;
1449   info->mallocs      = A->info.mallocs;
1450   info->memory       = ((PetscObject)A)->mem;
1451   if (A->factortype) {
1452     info->fill_ratio_given  = A->info.fill_ratio_given;
1453     info->fill_ratio_needed = A->info.fill_ratio_needed;
1454     info->factor_mallocs    = A->info.factor_mallocs;
1455   } else {
1456     info->fill_ratio_given  = 0;
1457     info->fill_ratio_needed = 0;
1458     info->factor_mallocs    = 0;
1459   }
1460   PetscFunctionReturn(0);
1461 }
1462 
1463 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1464 {
1465   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1466   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1467   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1468   MatScalar      *vp,value;
1469 
1470   PetscFunctionBegin;
1471   for (k=0; k<m; k++) { /* loop over added rows */
1472     row = im[k];
1473     if (row < 0) continue;
1474     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1475     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1476     cp    = a->colidx+shift; /* pointer to the row */
1477     vp    = a->val+shift; /* pointer to the row */
1478     nrow  = a->rlen[row];
1479     low   = 0;
1480     high  = nrow;
1481 
1482     for (l=0; l<n; l++) { /* loop over added columns */
1483       col = in[l];
1484       if (col<0) continue;
1485       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1486       if (a->roworiented) {
1487         value = v[l+k*n];
1488       } else {
1489         value = v[k+l*m];
1490       }
1491       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1492 
1493       /* search in this row for the specified column, i indicates the column to be set */
1494       if (col <= lastcol) low = 0;
1495       else high = nrow;
1496       lastcol = col;
1497       while (high-low > 5) {
1498         t = (low+high)/2;
1499         if (*(cp+t*8) > col) high = t;
1500         else low = t;
1501       }
1502       for (i=low; i<high; i++) {
1503         if (*(cp+i*8) > col) break;
1504         if (*(cp+i*8) == col) {
1505           if (is == ADD_VALUES) *(vp+i*8) += value;
1506           else *(vp+i*8) = value;
1507           low = i + 1;
1508           goto noinsert;
1509         }
1510       }
1511       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1512       if (nonew == 1) goto noinsert;
1513       PetscCheck(nonew != -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1514       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1515       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1516       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1517       for (ii=nrow-1; ii>=i; ii--) {
1518         *(cp+(ii+1)*8) = *(cp+ii*8);
1519         *(vp+(ii+1)*8) = *(vp+ii*8);
1520       }
1521       a->rlen[row]++;
1522       *(cp+i*8) = col;
1523       *(vp+i*8) = value;
1524       a->nz++;
1525       A->nonzerostate++;
1526       low = i+1; high++; nrow++;
1527 noinsert:;
1528     }
1529     a->rlen[row] = nrow;
1530   }
1531   PetscFunctionReturn(0);
1532 }
1533 
1534 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1535 {
1536   PetscFunctionBegin;
1537   /* If the two matrices have the same copy implementation, use fast copy. */
1538   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1539     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1540     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1541 
1542     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1543     PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]));
1544   } else {
1545     PetscCall(MatCopy_Basic(A,B,str));
1546   }
1547   PetscFunctionReturn(0);
1548 }
1549 
1550 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1551 {
1552   PetscFunctionBegin;
1553   PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL));
1554   PetscFunctionReturn(0);
1555 }
1556 
1557 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1558 {
1559   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1560 
1561   PetscFunctionBegin;
1562   *array = a->val;
1563   PetscFunctionReturn(0);
1564 }
1565 
1566 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1567 {
1568   PetscFunctionBegin;
1569   PetscFunctionReturn(0);
1570 }
1571 
1572 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1573 {
1574   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1575   PetscInt    i;
1576   MatScalar   *aval=a->val;
1577 
1578   PetscFunctionBegin;
1579   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1580   PetscFunctionReturn(0);
1581 }
1582 
1583 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1584 {
1585   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1586   PetscInt       i;
1587   MatScalar      *aval=a->val;
1588 
1589   PetscFunctionBegin;
1590   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1591   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1592   PetscFunctionReturn(0);
1593 }
1594 
1595 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1596 {
1597   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1598   MatScalar      *aval=a->val;
1599   PetscScalar    oalpha=alpha;
1600   PetscBLASInt   one=1,size;
1601 
1602   PetscFunctionBegin;
1603   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size));
1604   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1605   PetscCall(PetscLogFlops(a->nz));
1606   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1607   PetscFunctionReturn(0);
1608 }
1609 
1610 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1611 {
1612   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1613 
1614   PetscFunctionBegin;
1615   if (!Y->preallocated || !y->nz) {
1616     PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL));
1617   }
1618   PetscCall(MatShift_Basic(Y,a));
1619   PetscFunctionReturn(0);
1620 }
1621 
1622 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1623 {
1624   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1625   PetscScalar       *x,sum,*t;
1626   const MatScalar   *idiag=NULL,*mdiag;
1627   const PetscScalar *b,*xb;
1628   PetscInt          n,m=A->rmap->n,i,j,shift;
1629   const PetscInt    *diag;
1630 
1631   PetscFunctionBegin;
1632   its = its*lits;
1633 
1634   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1635   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift));
1636   a->fshift = fshift;
1637   a->omega  = omega;
1638 
1639   diag  = a->diag;
1640   t     = a->ssor_work;
1641   idiag = a->idiag;
1642   mdiag = a->mdiag;
1643 
1644   PetscCall(VecGetArray(xx,&x));
1645   PetscCall(VecGetArrayRead(bb,&b));
1646   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1647   PetscCheck(flag != SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1648   PetscCheck(flag != SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1649   PetscCheck(!(flag & SOR_EISENSTAT),PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1650 
1651   if (flag & SOR_ZERO_INITIAL_GUESS) {
1652     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1653       for (i=0; i<m; i++) {
1654         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1655         sum   = b[i];
1656         n     = (diag[i]-shift)/8;
1657         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1658         t[i]  = sum;
1659         x[i]  = sum*idiag[i];
1660       }
1661       xb   = t;
1662       PetscCall(PetscLogFlops(a->nz));
1663     } else xb = b;
1664     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1665       for (i=m-1; i>=0; i--) {
1666         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1667         sum   = xb[i];
1668         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1669         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1670         if (xb == b) {
1671           x[i] = sum*idiag[i];
1672         } else {
1673           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1674         }
1675       }
1676       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1677     }
1678     its--;
1679   }
1680   while (its--) {
1681     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1682       for (i=0; i<m; i++) {
1683         /* lower */
1684         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1685         sum   = b[i];
1686         n     = (diag[i]-shift)/8;
1687         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1688         t[i]  = sum;             /* save application of the lower-triangular part */
1689         /* upper */
1690         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1691         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1692         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1693       }
1694       xb   = t;
1695       PetscCall(PetscLogFlops(2.0*a->nz));
1696     } else xb = b;
1697     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1698       for (i=m-1; i>=0; i--) {
1699         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1700         sum = xb[i];
1701         if (xb == b) {
1702           /* whole matrix (no checkpointing available) */
1703           n     = a->rlen[i];
1704           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1705           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1706         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1707           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1708           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1709           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1710         }
1711       }
1712       if (xb == b) {
1713         PetscCall(PetscLogFlops(2.0*a->nz));
1714       } else {
1715         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1716       }
1717     }
1718   }
1719   PetscCall(VecRestoreArray(xx,&x));
1720   PetscCall(VecRestoreArrayRead(bb,&b));
1721   PetscFunctionReturn(0);
1722 }
1723 
1724 /* -------------------------------------------------------------------*/
1725 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1726                                        MatGetRow_SeqSELL,
1727                                        MatRestoreRow_SeqSELL,
1728                                        MatMult_SeqSELL,
1729                                /* 4*/  MatMultAdd_SeqSELL,
1730                                        MatMultTranspose_SeqSELL,
1731                                        MatMultTransposeAdd_SeqSELL,
1732                                        NULL,
1733                                        NULL,
1734                                        NULL,
1735                                /* 10*/ NULL,
1736                                        NULL,
1737                                        NULL,
1738                                        MatSOR_SeqSELL,
1739                                        NULL,
1740                                /* 15*/ MatGetInfo_SeqSELL,
1741                                        MatEqual_SeqSELL,
1742                                        MatGetDiagonal_SeqSELL,
1743                                        MatDiagonalScale_SeqSELL,
1744                                        NULL,
1745                                /* 20*/ NULL,
1746                                        MatAssemblyEnd_SeqSELL,
1747                                        MatSetOption_SeqSELL,
1748                                        MatZeroEntries_SeqSELL,
1749                                /* 24*/ NULL,
1750                                        NULL,
1751                                        NULL,
1752                                        NULL,
1753                                        NULL,
1754                                /* 29*/ MatSetUp_SeqSELL,
1755                                        NULL,
1756                                        NULL,
1757                                        NULL,
1758                                        NULL,
1759                                /* 34*/ MatDuplicate_SeqSELL,
1760                                        NULL,
1761                                        NULL,
1762                                        NULL,
1763                                        NULL,
1764                                /* 39*/ NULL,
1765                                        NULL,
1766                                        NULL,
1767                                        MatGetValues_SeqSELL,
1768                                        MatCopy_SeqSELL,
1769                                /* 44*/ NULL,
1770                                        MatScale_SeqSELL,
1771                                        MatShift_SeqSELL,
1772                                        NULL,
1773                                        NULL,
1774                                /* 49*/ NULL,
1775                                        NULL,
1776                                        NULL,
1777                                        NULL,
1778                                        NULL,
1779                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1780                                        NULL,
1781                                        NULL,
1782                                        NULL,
1783                                        NULL,
1784                                /* 59*/ NULL,
1785                                        MatDestroy_SeqSELL,
1786                                        MatView_SeqSELL,
1787                                        NULL,
1788                                        NULL,
1789                                /* 64*/ NULL,
1790                                        NULL,
1791                                        NULL,
1792                                        NULL,
1793                                        NULL,
1794                                /* 69*/ NULL,
1795                                        NULL,
1796                                        NULL,
1797                                        NULL,
1798                                        NULL,
1799                                /* 74*/ NULL,
1800                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1801                                        NULL,
1802                                        NULL,
1803                                        NULL,
1804                                /* 79*/ NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                        NULL,
1809                                /* 84*/ NULL,
1810                                        NULL,
1811                                        NULL,
1812                                        NULL,
1813                                        NULL,
1814                                /* 89*/ NULL,
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                        NULL,
1819                                /* 94*/ NULL,
1820                                        NULL,
1821                                        NULL,
1822                                        NULL,
1823                                        NULL,
1824                                /* 99*/ NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        MatConjugate_SeqSELL,
1828                                        NULL,
1829                                /*104*/ NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        NULL,
1833                                        NULL,
1834                                /*109*/ NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        NULL,
1838                                        MatMissingDiagonal_SeqSELL,
1839                                /*114*/ NULL,
1840                                        NULL,
1841                                        NULL,
1842                                        NULL,
1843                                        NULL,
1844                                /*119*/ NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                        NULL,
1849                                /*124*/ NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        NULL,
1853                                        NULL,
1854                                /*129*/ NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                        NULL,
1859                                /*134*/ NULL,
1860                                        NULL,
1861                                        NULL,
1862                                        NULL,
1863                                        NULL,
1864                                /*139*/ NULL,
1865                                        NULL,
1866                                        NULL,
1867                                        MatFDColoringSetUp_SeqXAIJ,
1868                                        NULL,
1869                                /*144*/ NULL,
1870                                        NULL,
1871                                        NULL,
1872                                        NULL
1873 };
1874 
1875 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1876 {
1877   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1878 
1879   PetscFunctionBegin;
1880   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1881 
1882   /* allocate space for values if not already there */
1883   if (!a->saved_values) {
1884     PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values));
1885     PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar)));
1886   }
1887 
1888   /* copy values over */
1889   PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]));
1890   PetscFunctionReturn(0);
1891 }
1892 
1893 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1894 {
1895   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1896 
1897   PetscFunctionBegin;
1898   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1899   PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1900   PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]));
1901   PetscFunctionReturn(0);
1902 }
1903 
1904 /*@C
1905  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1906 
1907  Not Collective
1908 
1909  Input Parameters:
1910  .  mat - a MATSEQSELL matrix
1911  .  array - pointer to the data
1912 
1913  Level: intermediate
1914 
1915  .seealso: `MatSeqSELLGetArray()`, `MatSeqSELLRestoreArrayF90()`
1916  @*/
1917 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1918 {
1919   PetscFunctionBegin;
1920   PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));
1921   PetscFunctionReturn(0);
1922 }
1923 
1924 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1925 {
1926   Mat_SeqSELL    *b;
1927   PetscMPIInt    size;
1928 
1929   PetscFunctionBegin;
1930   PetscCall(PetscCitationsRegister(citation,&cited));
1931   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
1932   PetscCheck(size <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1933 
1934   PetscCall(PetscNewLog(B,&b));
1935 
1936   B->data = (void*)b;
1937 
1938   PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps)));
1939 
1940   b->row                = NULL;
1941   b->col                = NULL;
1942   b->icol               = NULL;
1943   b->reallocs           = 0;
1944   b->ignorezeroentries  = PETSC_FALSE;
1945   b->roworiented        = PETSC_TRUE;
1946   b->nonew              = 0;
1947   b->diag               = NULL;
1948   b->solve_work         = NULL;
1949   B->spptr              = NULL;
1950   b->saved_values       = NULL;
1951   b->idiag              = NULL;
1952   b->mdiag              = NULL;
1953   b->ssor_work          = NULL;
1954   b->omega              = 1.0;
1955   b->fshift             = 0.0;
1956   b->idiagvalid         = PETSC_FALSE;
1957   b->keepnonzeropattern = PETSC_FALSE;
1958 
1959   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL));
1960   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL));
1961   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL));
1962   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL));
1963   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL));
1964   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL));
1965   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ));
1966   PetscFunctionReturn(0);
1967 }
1968 
1969 /*
1970  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1971  */
1972 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1973 {
1974   Mat_SeqSELL    *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data;
1975   PetscInt       i,m=A->rmap->n;
1976   PetscInt       totalslices=a->totalslices;
1977 
1978   PetscFunctionBegin;
1979   C->factortype = A->factortype;
1980   c->row        = NULL;
1981   c->col        = NULL;
1982   c->icol       = NULL;
1983   c->reallocs   = 0;
1984   C->assembled = PETSC_TRUE;
1985 
1986   PetscCall(PetscLayoutReference(A->rmap,&C->rmap));
1987   PetscCall(PetscLayoutReference(A->cmap,&C->cmap));
1988 
1989   PetscCall(PetscMalloc1(8*totalslices,&c->rlen));
1990   PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
1991   PetscCall(PetscMalloc1(totalslices+1,&c->sliidx));
1992   PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt)));
1993 
1994   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
1995   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
1996 
1997   /* allocate the matrix space */
1998   if (mallocmatspace) {
1999     PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx));
2000     PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt))));
2001 
2002     c->singlemalloc = PETSC_TRUE;
2003 
2004     if (m > 0) {
2005       PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat));
2006       if (cpvalues == MAT_COPY_VALUES) {
2007         PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat));
2008       } else {
2009         PetscCall(PetscArrayzero(c->val,a->maxallocmat));
2010       }
2011     }
2012   }
2013 
2014   c->ignorezeroentries = a->ignorezeroentries;
2015   c->roworiented       = a->roworiented;
2016   c->nonew             = a->nonew;
2017   if (a->diag) {
2018     PetscCall(PetscMalloc1(m,&c->diag));
2019     PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
2020     for (i=0; i<m; i++) {
2021       c->diag[i] = a->diag[i];
2022     }
2023   } else c->diag = NULL;
2024 
2025   c->solve_work         = NULL;
2026   c->saved_values       = NULL;
2027   c->idiag              = NULL;
2028   c->ssor_work          = NULL;
2029   c->keepnonzeropattern = a->keepnonzeropattern;
2030   c->free_val           = PETSC_TRUE;
2031   c->free_colidx        = PETSC_TRUE;
2032 
2033   c->maxallocmat  = a->maxallocmat;
2034   c->maxallocrow  = a->maxallocrow;
2035   c->rlenmax      = a->rlenmax;
2036   c->nz           = a->nz;
2037   C->preallocated = PETSC_TRUE;
2038 
2039   c->nonzerorowcnt = a->nonzerorowcnt;
2040   C->nonzerostate  = A->nonzerostate;
2041 
2042   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist));
2043   PetscFunctionReturn(0);
2044 }
2045 
2046 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2047 {
2048   PetscFunctionBegin;
2049   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
2050   PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n));
2051   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2052     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
2053   }
2054   PetscCall(MatSetType(*B,((PetscObject)A)->type_name));
2055   PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE));
2056   PetscFunctionReturn(0);
2057 }
2058 
2059 /*MC
2060    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2061    based on the sliced Ellpack format
2062 
2063    Options Database Keys:
2064 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()
2065 
2066    Level: beginner
2067 
2068 .seealso: `MatCreateSeqSell()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ`
2069 M*/
2070 
2071 /*MC
2072    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2073 
2074    This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
2075    and MATMPISELL otherwise.  As a result, for single process communicators,
2076   MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
2077   for communicators controlling multiple processes.  It is recommended that you call both of
2078   the above preallocation routines for simplicity.
2079 
2080    Options Database Keys:
2081 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2082 
2083   Level: beginner
2084 
2085   Notes:
2086    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2087 
2088    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2089    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2090 
2091   Developer Notes:
2092    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2093 
2094    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2095 .vb
2096                             (2 0  3 4)
2097    Consider the matrix A =  (5 0  6 0)
2098                             (0 0  7 8)
2099                             (0 0  9 9)
2100 
2101    symbolically the Ellpack format can be written as
2102 
2103         (2 3 4 |)           (0 2 3 |)
2104    v =  (5 6 0 |)  colidx = (0 2 2 |)
2105         --------            ---------
2106         (7 8 |)             (2 3 |)
2107         (9 9 |)             (2 3 |)
2108 
2109     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2110     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2111     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2112 
2113     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2114 
2115 .ve
2116 
2117       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2118 
2119  References:
2120 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2121    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2122 
2123 .seealso: `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSell()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ`
2124 M*/
2125 
2126 /*@C
2127        MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2128 
2129  Collective on comm
2130 
2131  Input Parameters:
2132 +  comm - MPI communicator, set to PETSC_COMM_SELF
2133 .  m - number of rows
2134 .  n - number of columns
2135 .  rlenmax - maximum number of nonzeros in a row
2136 -  rlen - array containing the number of nonzeros in the various rows
2137  (possibly different for each row) or NULL
2138 
2139  Output Parameter:
2140 .  A - the matrix
2141 
2142  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2143  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2144  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2145 
2146  Notes:
2147  If nnz is given then nz is ignored
2148 
2149  Specify the preallocated storage with either rlenmax or rlen (not both).
2150  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2151  allocation.  For large problems you MUST preallocate memory or you
2152  will get TERRIBLE performance, see the users' manual chapter on matrices.
2153 
2154  Level: intermediate
2155 
2156  .seealso: `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATSEQSELL`, `MATMPISELL`
2157 
2158  @*/
2159 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2160 {
2161   PetscFunctionBegin;
2162   PetscCall(MatCreate(comm,A));
2163   PetscCall(MatSetSizes(*A,m,n,m,n));
2164   PetscCall(MatSetType(*A,MATSEQSELL));
2165   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen));
2166   PetscFunctionReturn(0);
2167 }
2168 
2169 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2170 {
2171   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2172   PetscInt       totalslices=a->totalslices;
2173 
2174   PetscFunctionBegin;
2175   /* If the  matrix dimensions are not equal,or no of nonzeros */
2176   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2177     *flg = PETSC_FALSE;
2178     PetscFunctionReturn(0);
2179   }
2180   /* if the a->colidx are the same */
2181   PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg));
2182   if (!*flg) PetscFunctionReturn(0);
2183   /* if a->val are the same */
2184   PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg));
2185   PetscFunctionReturn(0);
2186 }
2187 
2188 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2189 {
2190   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2191 
2192   PetscFunctionBegin;
2193   a->idiagvalid  = PETSC_FALSE;
2194   PetscFunctionReturn(0);
2195 }
2196 
2197 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2198 {
2199 #if defined(PETSC_USE_COMPLEX)
2200   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2201   PetscInt    i;
2202   PetscScalar *val = a->val;
2203 
2204   PetscFunctionBegin;
2205   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2206     val[i] = PetscConj(val[i]);
2207   }
2208 #else
2209   PetscFunctionBegin;
2210 #endif
2211   PetscFunctionReturn(0);
2212 }
2213