xref: /petsc/src/mat/impls/sell/seq/sell.c (revision ef46b1a67e276116c83b5d4ce8efc2932ea4fc0a)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 
9 static PetscBool  cited = PETSC_FALSE;
10 static const char citation[] =
11   "@inproceedings{ZhangELLPACK2018,\n"
12   " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n"
13   " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n"
14   " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n"
15   " year = 2018\n"
16 "}\n";
17 
18 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
19 
20   #include <immintrin.h>
21 
22   #if !defined(_MM_SCALE_8)
23   #define _MM_SCALE_8    8
24   #endif
25 
26   #if defined(__AVX512F__)
27   /* these do not work
28    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
29    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
30   */
31     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
32     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
33     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
34     vec_vals = _mm512_loadu_pd(aval); \
35     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
36     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
37   #elif defined(__AVX2__) && defined(__FMA__)
38     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
39     vec_vals = _mm256_loadu_pd(aval); \
40     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
41     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
42     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
43   #endif
44 #endif  /* PETSC_HAVE_IMMINTRIN_H */
45 
46 /*@C
47  MatSeqSELLSetPreallocation - For good matrix assembly performance
48  the user should preallocate the matrix storage by setting the parameter nz
49  (or the array nnz).  By setting these parameters accurately, performance
50  during matrix assembly can be increased significantly.
51 
52  Collective
53 
54  Input Parameters:
55  +  B - The matrix
56  .  nz - number of nonzeros per row (same for all rows)
57  -  nnz - array containing the number of nonzeros in the various rows
58  (possibly different for each row) or NULL
59 
60  Notes:
61  If nnz is given then nz is ignored.
62 
63  Specify the preallocated storage with either nz or nnz (not both).
64  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
65  allocation.  For large problems you MUST preallocate memory or you
66  will get TERRIBLE performance, see the users' manual chapter on matrices.
67 
68  You can call MatGetInfo() to get information on how effective the preallocation was;
69  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
70  You can also run with the option -info and look for messages with the string
71  malloc in them to see if additional memory allocation was needed.
72 
73  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
74  entries or columns indices.
75 
76  The maximum number of nonzeos in any row should be as accurate as possible.
77  If it is underestimated, you will get bad performance due to reallocation
78  (MatSeqXSELLReallocateSELL).
79 
80  Level: intermediate
81 
82  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
83 
84  @*/
85 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
86 {
87   PetscFunctionBegin;
88   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
89   PetscValidType(B,1);
90   PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));
91   PetscFunctionReturn(0);
92 }
93 
94 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
95 {
96   Mat_SeqSELL    *b;
97   PetscInt       i,j,totalslices;
98   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
99 
100   PetscFunctionBegin;
101   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
102   if (maxallocrow == MAT_SKIP_ALLOCATION) {
103     skipallocation = PETSC_TRUE;
104     maxallocrow    = 0;
105   }
106 
107   PetscCall(PetscLayoutSetUp(B->rmap));
108   PetscCall(PetscLayoutSetUp(B->cmap));
109 
110   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
111   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
112   PetscCheck(maxallocrow >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %" PetscInt_FMT,maxallocrow);
113   if (rlen) {
114     for (i=0; i<B->rmap->n; i++) {
115       PetscCheck(rlen[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,rlen[i]);
116       PetscCheck(rlen[i] <= B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT,i,rlen[i],B->cmap->n);
117     }
118   }
119 
120   B->preallocated = PETSC_TRUE;
121 
122   b = (Mat_SeqSELL*)B->data;
123 
124   totalslices = PetscCeilInt(B->rmap->n,8);
125   b->totalslices = totalslices;
126   if (!skipallocation) {
127     if (B->rmap->n & 0x07) PetscCall(PetscInfo(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %" PetscInt_FMT ")\n",B->rmap->n));
128 
129     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
130       PetscCall(PetscMalloc1(totalslices+1,&b->sliidx));
131       PetscCall(PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt)));
132     }
133     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
134       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
135       else if (maxallocrow < 0) maxallocrow = 1;
136       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
137     } else {
138       maxallocrow = 0;
139       b->sliidx[0] = 0;
140       for (i=1; i<totalslices; i++) {
141         b->sliidx[i] = 0;
142         for (j=0;j<8;j++) {
143           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
144         }
145         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
146         PetscCall(PetscIntSumError(b->sliidx[i-1],8*b->sliidx[i],&b->sliidx[i]));
147       }
148       /* last slice */
149       b->sliidx[totalslices] = 0;
150       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
151       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
152       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
153     }
154 
155     /* allocate space for val, colidx, rlen */
156     /* FIXME: should B's old memory be unlogged? */
157     PetscCall(MatSeqXSELLFreeSELL(B,&b->val,&b->colidx));
158     /* FIXME: assuming an element of the bit array takes 8 bits */
159     PetscCall(PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx));
160     PetscCall(PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt))));
161     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
162     PetscCall(PetscCalloc1(8*totalslices,&b->rlen));
163     PetscCall(PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt)));
164 
165     b->singlemalloc = PETSC_TRUE;
166     b->free_val     = PETSC_TRUE;
167     b->free_colidx  = PETSC_TRUE;
168   } else {
169     b->free_val    = PETSC_FALSE;
170     b->free_colidx = PETSC_FALSE;
171   }
172 
173   b->nz               = 0;
174   b->maxallocrow      = maxallocrow;
175   b->rlenmax          = maxallocrow;
176   b->maxallocmat      = b->sliidx[totalslices];
177   B->info.nz_unneeded = (double)b->maxallocmat;
178   if (realalloc) {
179     PetscCall(MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));
180   }
181   PetscFunctionReturn(0);
182 }
183 
184 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
185 {
186   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
187   PetscInt    shift;
188 
189   PetscFunctionBegin;
190   PetscCheckFalse(row < 0 || row >= A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %" PetscInt_FMT " out of range",row);
191   if (nz) *nz = a->rlen[row];
192   shift = a->sliidx[row>>3]+(row&0x07);
193   if (!a->getrowcols) {
194 
195     PetscCall(PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals));
196   }
197   if (idx) {
198     PetscInt j;
199     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
200     *idx = a->getrowcols;
201   }
202   if (v) {
203     PetscInt j;
204     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
205     *v = a->getrowvals;
206   }
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
211 {
212   PetscFunctionBegin;
213   PetscFunctionReturn(0);
214 }
215 
216 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
217 {
218   Mat            B;
219   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
220   PetscInt       i;
221 
222   PetscFunctionBegin;
223   if (reuse == MAT_REUSE_MATRIX) {
224     B    = *newmat;
225     PetscCall(MatZeroEntries(B));
226   } else {
227     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
228     PetscCall(MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N));
229     PetscCall(MatSetType(B,MATSEQAIJ));
230     PetscCall(MatSeqAIJSetPreallocation(B,0,a->rlen));
231   }
232 
233   for (i=0; i<A->rmap->n; i++) {
234     PetscInt    nz = 0,*cols = NULL;
235     PetscScalar *vals = NULL;
236 
237     PetscCall(MatGetRow_SeqSELL(A,i,&nz,&cols,&vals));
238     PetscCall(MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES));
239     PetscCall(MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals));
240   }
241 
242   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
243   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
244   B->rmap->bs = A->rmap->bs;
245 
246   if (reuse == MAT_INPLACE_MATRIX) {
247     PetscCall(MatHeaderReplace(A,&B));
248   } else {
249     *newmat = B;
250   }
251   PetscFunctionReturn(0);
252 }
253 
254 #include <../src/mat/impls/aij/seq/aij.h>
255 
256 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
257 {
258   Mat               B;
259   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
260   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
261   const PetscInt    *cols;
262   const PetscScalar *vals;
263 
264   PetscFunctionBegin;
265 
266   if (reuse == MAT_REUSE_MATRIX) {
267     B = *newmat;
268   } else {
269     if (PetscDefined(USE_DEBUG) || !a->ilen) {
270       PetscCall(PetscMalloc1(m,&rowlengths));
271       for (i=0; i<m; i++) {
272         rowlengths[i] = ai[i+1] - ai[i];
273       }
274     }
275     if (PetscDefined(USE_DEBUG) && a->ilen) {
276       PetscBool eq;
277       PetscCall(PetscMemcmp(rowlengths,a->ilen,m*sizeof(PetscInt),&eq));
278       PetscCheck(eq,PETSC_COMM_SELF,PETSC_ERR_PLIB,"SeqAIJ ilen array incorrect");
279       PetscCall(PetscFree(rowlengths));
280       rowlengths = a->ilen;
281     } else if (a->ilen) rowlengths = a->ilen;
282     PetscCall(MatCreate(PetscObjectComm((PetscObject)A),&B));
283     PetscCall(MatSetSizes(B,m,n,m,n));
284     PetscCall(MatSetType(B,MATSEQSELL));
285     PetscCall(MatSeqSELLSetPreallocation(B,0,rowlengths));
286     if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths));
287   }
288 
289   for (row=0; row<m; row++) {
290     PetscCall(MatGetRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
291     PetscCall(MatSetValues_SeqSELL(B,1,&row,ncols,cols,vals,INSERT_VALUES));
292     PetscCall(MatRestoreRow_SeqAIJ(A,row,&ncols,(PetscInt**)&cols,(PetscScalar**)&vals));
293   }
294   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
295   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
296   B->rmap->bs = A->rmap->bs;
297 
298   if (reuse == MAT_INPLACE_MATRIX) {
299     PetscCall(MatHeaderReplace(A,&B));
300   } else {
301     *newmat = B;
302   }
303   PetscFunctionReturn(0);
304 }
305 
306 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
307 {
308   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
309   PetscScalar       *y;
310   const PetscScalar *x;
311   const MatScalar   *aval=a->val;
312   PetscInt          totalslices=a->totalslices;
313   const PetscInt    *acolidx=a->colidx;
314   PetscInt          i,j;
315 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
316   __m512d           vec_x,vec_y,vec_vals;
317   __m256i           vec_idx;
318   __mmask8          mask;
319   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
320   __m256i           vec_idx2,vec_idx3,vec_idx4;
321 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
322   __m128i           vec_idx;
323   __m256d           vec_x,vec_y,vec_y2,vec_vals;
324   MatScalar         yval;
325   PetscInt          r,rows_left,row,nnz_in_row;
326 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
327   __m128d           vec_x_tmp;
328   __m256d           vec_x,vec_y,vec_y2,vec_vals;
329   MatScalar         yval;
330   PetscInt          r,rows_left,row,nnz_in_row;
331 #else
332   PetscScalar       sum[8];
333 #endif
334 
335 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
336 #pragma disjoint(*x,*y,*aval)
337 #endif
338 
339   PetscFunctionBegin;
340   PetscCall(VecGetArrayRead(xx,&x));
341   PetscCall(VecGetArray(yy,&y));
342 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
343   for (i=0; i<totalslices; i++) { /* loop over slices */
344     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
345     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
346 
347     vec_y  = _mm512_setzero_pd();
348     vec_y2 = _mm512_setzero_pd();
349     vec_y3 = _mm512_setzero_pd();
350     vec_y4 = _mm512_setzero_pd();
351 
352     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
353     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
354     case 3:
355       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
356       acolidx += 8; aval += 8;
357       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
358       acolidx += 8; aval += 8;
359       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
360       acolidx += 8; aval += 8;
361       j += 3;
362       break;
363     case 2:
364       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
365       acolidx += 8; aval += 8;
366       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
367       acolidx += 8; aval += 8;
368       j += 2;
369       break;
370     case 1:
371       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
372       acolidx += 8; aval += 8;
373       j += 1;
374       break;
375     }
376     #pragma novector
377     for (; j<(a->sliidx[i+1]>>3); j+=4) {
378       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
379       acolidx += 8; aval += 8;
380       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
381       acolidx += 8; aval += 8;
382       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
383       acolidx += 8; aval += 8;
384       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
385       acolidx += 8; aval += 8;
386     }
387 
388     vec_y = _mm512_add_pd(vec_y,vec_y2);
389     vec_y = _mm512_add_pd(vec_y,vec_y3);
390     vec_y = _mm512_add_pd(vec_y,vec_y4);
391     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
392       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
393       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
394     } else {
395       _mm512_storeu_pd(&y[8*i],vec_y);
396     }
397   }
398 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
399   for (i=0; i<totalslices; i++) { /* loop over full slices */
400     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
401     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
402 
403     /* last slice may have padding rows. Don't use vectorization. */
404     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
405       rows_left = A->rmap->n - 8*i;
406       for (r=0; r<rows_left; ++r) {
407         yval = (MatScalar)0;
408         row = 8*i + r;
409         nnz_in_row = a->rlen[row];
410         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
411         y[row] = yval;
412       }
413       break;
414     }
415 
416     vec_y  = _mm256_setzero_pd();
417     vec_y2 = _mm256_setzero_pd();
418 
419     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
420     #pragma novector
421     #pragma unroll(2)
422     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
423       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
424       aval += 4; acolidx += 4;
425       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
426       aval += 4; acolidx += 4;
427     }
428 
429     _mm256_storeu_pd(y+i*8,vec_y);
430     _mm256_storeu_pd(y+i*8+4,vec_y2);
431   }
432 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
433   for (i=0; i<totalslices; i++) { /* loop over full slices */
434     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
435     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
436 
437     vec_y  = _mm256_setzero_pd();
438     vec_y2 = _mm256_setzero_pd();
439 
440     /* last slice may have padding rows. Don't use vectorization. */
441     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
442       rows_left = A->rmap->n - 8*i;
443       for (r=0; r<rows_left; ++r) {
444         yval = (MatScalar)0;
445         row = 8*i + r;
446         nnz_in_row = a->rlen[row];
447         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
448         y[row] = yval;
449       }
450       break;
451     }
452 
453     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
454     #pragma novector
455     #pragma unroll(2)
456     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
457       vec_vals  = _mm256_loadu_pd(aval);
458       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
459       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
460       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
464       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
465       aval     += 4;
466 
467       vec_vals  = _mm256_loadu_pd(aval);
468       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
469       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
470       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
471       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
472       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
473       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
474       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
475       aval     += 4;
476     }
477 
478     _mm256_storeu_pd(y + i*8,     vec_y);
479     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
480   }
481 #else
482   for (i=0; i<totalslices; i++) { /* loop over slices */
483     for (j=0; j<8; j++) sum[j] = 0.0;
484     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
485       sum[0] += aval[j] * x[acolidx[j]];
486       sum[1] += aval[j+1] * x[acolidx[j+1]];
487       sum[2] += aval[j+2] * x[acolidx[j+2]];
488       sum[3] += aval[j+3] * x[acolidx[j+3]];
489       sum[4] += aval[j+4] * x[acolidx[j+4]];
490       sum[5] += aval[j+5] * x[acolidx[j+5]];
491       sum[6] += aval[j+6] * x[acolidx[j+6]];
492       sum[7] += aval[j+7] * x[acolidx[j+7]];
493     }
494     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
495       for (j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
496     } else {
497       for (j=0; j<8; j++) y[8*i+j] = sum[j];
498     }
499   }
500 #endif
501 
502   PetscCall(PetscLogFlops(2.0*a->nz-a->nonzerorowcnt)); /* theoretical minimal FLOPs */
503   PetscCall(VecRestoreArrayRead(xx,&x));
504   PetscCall(VecRestoreArray(yy,&y));
505   PetscFunctionReturn(0);
506 }
507 
508 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
509 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
510 {
511   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
512   PetscScalar       *y,*z;
513   const PetscScalar *x;
514   const MatScalar   *aval=a->val;
515   PetscInt          totalslices=a->totalslices;
516   const PetscInt    *acolidx=a->colidx;
517   PetscInt          i,j;
518 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
519   __m512d           vec_x,vec_y,vec_vals;
520   __m256i           vec_idx;
521   __mmask8          mask;
522   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
523   __m256i           vec_idx2,vec_idx3,vec_idx4;
524 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
525   __m128d           vec_x_tmp;
526   __m256d           vec_x,vec_y,vec_y2,vec_vals;
527   MatScalar         yval;
528   PetscInt          r,row,nnz_in_row;
529 #else
530   PetscScalar       sum[8];
531 #endif
532 
533 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
534 #pragma disjoint(*x,*y,*aval)
535 #endif
536 
537   PetscFunctionBegin;
538   PetscCall(VecGetArrayRead(xx,&x));
539   PetscCall(VecGetArrayPair(yy,zz,&y,&z));
540 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
541   for (i=0; i<totalslices; i++) { /* loop over slices */
542     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
543     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
544 
545     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
546       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
547       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
548     } else {
549       vec_y  = _mm512_loadu_pd(&y[8*i]);
550     }
551     vec_y2 = _mm512_setzero_pd();
552     vec_y3 = _mm512_setzero_pd();
553     vec_y4 = _mm512_setzero_pd();
554 
555     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
556     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
557     case 3:
558       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
559       acolidx += 8; aval += 8;
560       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
561       acolidx += 8; aval += 8;
562       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
563       acolidx += 8; aval += 8;
564       j += 3;
565       break;
566     case 2:
567       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
568       acolidx += 8; aval += 8;
569       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
570       acolidx += 8; aval += 8;
571       j += 2;
572       break;
573     case 1:
574       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
575       acolidx += 8; aval += 8;
576       j += 1;
577       break;
578     }
579     #pragma novector
580     for (; j<(a->sliidx[i+1]>>3); j+=4) {
581       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
582       acolidx += 8; aval += 8;
583       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
584       acolidx += 8; aval += 8;
585       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
586       acolidx += 8; aval += 8;
587       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
588       acolidx += 8; aval += 8;
589     }
590 
591     vec_y = _mm512_add_pd(vec_y,vec_y2);
592     vec_y = _mm512_add_pd(vec_y,vec_y3);
593     vec_y = _mm512_add_pd(vec_y,vec_y4);
594     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
595       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
596     } else {
597       _mm512_storeu_pd(&z[8*i],vec_y);
598     }
599   }
600 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
601   for (i=0; i<totalslices; i++) { /* loop over full slices */
602     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
603     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
604 
605     /* last slice may have padding rows. Don't use vectorization. */
606     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
607       for (r=0; r<(A->rmap->n & 0x07); ++r) {
608         row        = 8*i + r;
609         yval       = (MatScalar)0.0;
610         nnz_in_row = a->rlen[row];
611         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
612         z[row] = y[row] + yval;
613       }
614       break;
615     }
616 
617     vec_y  = _mm256_loadu_pd(y+8*i);
618     vec_y2 = _mm256_loadu_pd(y+8*i+4);
619 
620     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
621     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
622       vec_vals  = _mm256_loadu_pd(aval);
623       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
624       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
625       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
626       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
627       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
628       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
629       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
630       aval     += 4;
631 
632       vec_vals  = _mm256_loadu_pd(aval);
633       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
634       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
635       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
636       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
637       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
638       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
639       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
640       aval     += 4;
641     }
642 
643     _mm256_storeu_pd(z+i*8,vec_y);
644     _mm256_storeu_pd(z+i*8+4,vec_y2);
645   }
646 #else
647   for (i=0; i<totalslices; i++) { /* loop over slices */
648     for (j=0; j<8; j++) sum[j] = 0.0;
649     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
650       sum[0] += aval[j] * x[acolidx[j]];
651       sum[1] += aval[j+1] * x[acolidx[j+1]];
652       sum[2] += aval[j+2] * x[acolidx[j+2]];
653       sum[3] += aval[j+3] * x[acolidx[j+3]];
654       sum[4] += aval[j+4] * x[acolidx[j+4]];
655       sum[5] += aval[j+5] * x[acolidx[j+5]];
656       sum[6] += aval[j+6] * x[acolidx[j+6]];
657       sum[7] += aval[j+7] * x[acolidx[j+7]];
658     }
659     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
660       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
661     } else {
662       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
663     }
664   }
665 #endif
666 
667   PetscCall(PetscLogFlops(2.0*a->nz));
668   PetscCall(VecRestoreArrayRead(xx,&x));
669   PetscCall(VecRestoreArrayPair(yy,zz,&y,&z));
670   PetscFunctionReturn(0);
671 }
672 
673 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
674 {
675   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
676   PetscScalar       *y;
677   const PetscScalar *x;
678   const MatScalar   *aval=a->val;
679   const PetscInt    *acolidx=a->colidx;
680   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
681 
682 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
683 #pragma disjoint(*x,*y,*aval)
684 #endif
685 
686   PetscFunctionBegin;
687   if (A->symmetric) {
688     PetscCall(MatMultAdd_SeqSELL(A,xx,zz,yy));
689     PetscFunctionReturn(0);
690   }
691   if (zz != yy) PetscCall(VecCopy(zz,yy));
692   PetscCall(VecGetArrayRead(xx,&x));
693   PetscCall(VecGetArray(yy,&y));
694   for (i=0; i<a->totalslices; i++) { /* loop over slices */
695     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
696       for (r=0; r<(A->rmap->n & 0x07); ++r) {
697         row        = 8*i + r;
698         nnz_in_row = a->rlen[row];
699         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
700       }
701       break;
702     }
703     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
704       y[acolidx[j]]   += aval[j] * x[8*i];
705       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
706       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
707       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
708       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
709       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
710       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
711       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
712     }
713   }
714   PetscCall(PetscLogFlops(2.0*a->sliidx[a->totalslices]));
715   PetscCall(VecRestoreArrayRead(xx,&x));
716   PetscCall(VecRestoreArray(yy,&y));
717   PetscFunctionReturn(0);
718 }
719 
720 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
721 {
722   PetscFunctionBegin;
723   if (A->symmetric) {
724     PetscCall(MatMult_SeqSELL(A,xx,yy));
725   } else {
726     PetscCall(VecSet(yy,0.0));
727     PetscCall(MatMultTransposeAdd_SeqSELL(A,xx,yy,yy));
728   }
729   PetscFunctionReturn(0);
730 }
731 
732 /*
733      Checks for missing diagonals
734 */
735 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
736 {
737   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
738   PetscInt       *diag,i;
739 
740   PetscFunctionBegin;
741   *missing = PETSC_FALSE;
742   if (A->rmap->n > 0 && !(a->colidx)) {
743     *missing = PETSC_TRUE;
744     if (d) *d = 0;
745     PetscCall(PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n"));
746   } else {
747     diag = a->diag;
748     for (i=0; i<A->rmap->n; i++) {
749       if (diag[i] == -1) {
750         *missing = PETSC_TRUE;
751         if (d) *d = i;
752         PetscCall(PetscInfo(A,"Matrix is missing diagonal number %" PetscInt_FMT "\n",i));
753         break;
754       }
755     }
756   }
757   PetscFunctionReturn(0);
758 }
759 
760 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
761 {
762   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
763   PetscInt       i,j,m=A->rmap->n,shift;
764 
765   PetscFunctionBegin;
766   if (!a->diag) {
767     PetscCall(PetscMalloc1(m,&a->diag));
768     PetscCall(PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt)));
769     a->free_diag = PETSC_TRUE;
770   }
771   for (i=0; i<m; i++) { /* loop over rows */
772     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
773     a->diag[i] = -1;
774     for (j=0; j<a->rlen[i]; j++) {
775       if (a->colidx[shift+j*8] == i) {
776         a->diag[i] = shift+j*8;
777         break;
778       }
779     }
780   }
781   PetscFunctionReturn(0);
782 }
783 
784 /*
785   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
786 */
787 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
788 {
789   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
790   PetscInt       i,*diag,m = A->rmap->n;
791   MatScalar      *val = a->val;
792   PetscScalar    *idiag,*mdiag;
793 
794   PetscFunctionBegin;
795   if (a->idiagvalid) PetscFunctionReturn(0);
796   PetscCall(MatMarkDiagonal_SeqSELL(A));
797   diag = a->diag;
798   if (!a->idiag) {
799     PetscCall(PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work));
800     PetscCall(PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar)));
801     val  = a->val;
802   }
803   mdiag = a->mdiag;
804   idiag = a->idiag;
805 
806   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
807     for (i=0; i<m; i++) {
808       mdiag[i] = val[diag[i]];
809       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
810         if (PetscRealPart(fshift)) {
811           PetscCall(PetscInfo(A,"Zero diagonal on row %" PetscInt_FMT "\n",i));
812           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
813           A->factorerror_zeropivot_value = 0.0;
814           A->factorerror_zeropivot_row   = i;
815         } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %" PetscInt_FMT,i);
816       }
817       idiag[i] = 1.0/val[diag[i]];
818     }
819     PetscCall(PetscLogFlops(m));
820   } else {
821     for (i=0; i<m; i++) {
822       mdiag[i] = val[diag[i]];
823       idiag[i] = omega/(fshift + val[diag[i]]);
824     }
825     PetscCall(PetscLogFlops(2.0*m));
826   }
827   a->idiagvalid = PETSC_TRUE;
828   PetscFunctionReturn(0);
829 }
830 
831 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
832 {
833   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
834 
835   PetscFunctionBegin;
836   PetscCall(PetscArrayzero(a->val,a->sliidx[a->totalslices]));
837   PetscCall(MatSeqSELLInvalidateDiagonal(A));
838   PetscFunctionReturn(0);
839 }
840 
841 PetscErrorCode MatDestroy_SeqSELL(Mat A)
842 {
843   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
844 
845   PetscFunctionBegin;
846 #if defined(PETSC_USE_LOG)
847   PetscLogObjectState((PetscObject)A,"Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT,A->rmap->n,A->cmap->n,a->nz);
848 #endif
849   PetscCall(MatSeqXSELLFreeSELL(A,&a->val,&a->colidx));
850   PetscCall(ISDestroy(&a->row));
851   PetscCall(ISDestroy(&a->col));
852   PetscCall(PetscFree(a->diag));
853   PetscCall(PetscFree(a->rlen));
854   PetscCall(PetscFree(a->sliidx));
855   PetscCall(PetscFree3(a->idiag,a->mdiag,a->ssor_work));
856   PetscCall(PetscFree(a->solve_work));
857   PetscCall(ISDestroy(&a->icol));
858   PetscCall(PetscFree(a->saved_values));
859   PetscCall(PetscFree2(a->getrowcols,a->getrowvals));
860 
861   PetscCall(PetscFree(A->data));
862 
863   PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
864   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL));
865   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL));
866   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL));
867   PetscFunctionReturn(0);
868 }
869 
870 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
871 {
872   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
873 
874   PetscFunctionBegin;
875   switch (op) {
876   case MAT_ROW_ORIENTED:
877     a->roworiented = flg;
878     break;
879   case MAT_KEEP_NONZERO_PATTERN:
880     a->keepnonzeropattern = flg;
881     break;
882   case MAT_NEW_NONZERO_LOCATIONS:
883     a->nonew = (flg ? 0 : 1);
884     break;
885   case MAT_NEW_NONZERO_LOCATION_ERR:
886     a->nonew = (flg ? -1 : 0);
887     break;
888   case MAT_NEW_NONZERO_ALLOCATION_ERR:
889     a->nonew = (flg ? -2 : 0);
890     break;
891   case MAT_UNUSED_NONZERO_LOCATION_ERR:
892     a->nounused = (flg ? -1 : 0);
893     break;
894   case MAT_FORCE_DIAGONAL_ENTRIES:
895   case MAT_IGNORE_OFF_PROC_ENTRIES:
896   case MAT_USE_HASH_TABLE:
897   case MAT_SORTED_FULL:
898     PetscCall(PetscInfo(A,"Option %s ignored\n",MatOptions[op]));
899     break;
900   case MAT_SPD:
901   case MAT_SYMMETRIC:
902   case MAT_STRUCTURALLY_SYMMETRIC:
903   case MAT_HERMITIAN:
904   case MAT_SYMMETRY_ETERNAL:
905     /* These options are handled directly by MatSetOption() */
906     break;
907   default:
908     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
909   }
910   PetscFunctionReturn(0);
911 }
912 
913 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
914 {
915   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
916   PetscInt       i,j,n,shift;
917   PetscScalar    *x,zero=0.0;
918 
919   PetscFunctionBegin;
920   PetscCall(VecGetLocalSize(v,&n));
921   PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
922 
923   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
924     PetscInt *diag=a->diag;
925     PetscCall(VecGetArray(v,&x));
926     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
927     PetscCall(VecRestoreArray(v,&x));
928     PetscFunctionReturn(0);
929   }
930 
931   PetscCall(VecSet(v,zero));
932   PetscCall(VecGetArray(v,&x));
933   for (i=0; i<n; i++) { /* loop over rows */
934     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
935     x[i] = 0;
936     for (j=0; j<a->rlen[i]; j++) {
937       if (a->colidx[shift+j*8] == i) {
938         x[i] = a->val[shift+j*8];
939         break;
940       }
941     }
942   }
943   PetscCall(VecRestoreArray(v,&x));
944   PetscFunctionReturn(0);
945 }
946 
947 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
948 {
949   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
950   const PetscScalar *l,*r;
951   PetscInt          i,j,m,n,row;
952 
953   PetscFunctionBegin;
954   if (ll) {
955     /* The local size is used so that VecMPI can be passed to this routine
956        by MatDiagonalScale_MPISELL */
957     PetscCall(VecGetLocalSize(ll,&m));
958     PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
959     PetscCall(VecGetArrayRead(ll,&l));
960     for (i=0; i<a->totalslices; i++) { /* loop over slices */
961       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
962         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
963           if (row < (A->rmap->n & 0x07)) a->val[j] *= l[8*i+row];
964         }
965       } else {
966         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
967           a->val[j] *= l[8*i+row];
968         }
969       }
970     }
971     PetscCall(VecRestoreArrayRead(ll,&l));
972     PetscCall(PetscLogFlops(a->nz));
973   }
974   if (rr) {
975     PetscCall(VecGetLocalSize(rr,&n));
976     PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
977     PetscCall(VecGetArrayRead(rr,&r));
978     for (i=0; i<a->totalslices; i++) { /* loop over slices */
979       if (i == a->totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
980         for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
981           if (row < (A->rmap->n & 0x07)) a->val[j] *= r[a->colidx[j]];
982         }
983       } else {
984         for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
985           a->val[j] *= r[a->colidx[j]];
986         }
987       }
988     }
989     PetscCall(VecRestoreArrayRead(rr,&r));
990     PetscCall(PetscLogFlops(a->nz));
991   }
992   PetscCall(MatSeqSELLInvalidateDiagonal(A));
993   PetscFunctionReturn(0);
994 }
995 
996 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
997 {
998   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
999   PetscInt    *cp,i,k,low,high,t,row,col,l;
1000   PetscInt    shift;
1001   MatScalar   *vp;
1002 
1003   PetscFunctionBegin;
1004   for (k=0; k<m; k++) { /* loop over requested rows */
1005     row = im[k];
1006     if (row<0) continue;
1007     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1008     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1009     cp = a->colidx+shift; /* pointer to the row */
1010     vp = a->val+shift; /* pointer to the row */
1011     for (l=0; l<n; l++) { /* loop over requested columns */
1012       col = in[l];
1013       if (col<0) continue;
1014       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1015       high = a->rlen[row]; low = 0; /* assume unsorted */
1016       while (high-low > 5) {
1017         t = (low+high)/2;
1018         if (*(cp+t*8) > col) high = t;
1019         else low = t;
1020       }
1021       for (i=low; i<high; i++) {
1022         if (*(cp+8*i) > col) break;
1023         if (*(cp+8*i) == col) {
1024           *v++ = *(vp+8*i);
1025           goto finished;
1026         }
1027       }
1028       *v++ = 0.0;
1029     finished:;
1030     }
1031   }
1032   PetscFunctionReturn(0);
1033 }
1034 
1035 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1036 {
1037   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1038   PetscInt          i,j,m=A->rmap->n,shift;
1039   const char        *name;
1040   PetscViewerFormat format;
1041 
1042   PetscFunctionBegin;
1043   PetscCall(PetscViewerGetFormat(viewer,&format));
1044   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1045     PetscInt nofinalvalue = 0;
1046     /*
1047     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1048       nofinalvalue = 1;
1049     }
1050     */
1051     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1052     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n",m,A->cmap->n));
1053     PetscCall(PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %" PetscInt_FMT " \n",a->nz));
1054 #if defined(PETSC_USE_COMPLEX)
1055     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",4);\n",a->nz+nofinalvalue));
1056 #else
1057     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = zeros(%" PetscInt_FMT ",3);\n",a->nz+nofinalvalue));
1058 #endif
1059     PetscCall(PetscViewerASCIIPrintf(viewer,"zzz = [\n"));
1060 
1061     for (i=0; i<m; i++) {
1062       shift = a->sliidx[i>>3]+(i&0x07);
1063       for (j=0; j<a->rlen[i]; j++) {
1064 #if defined(PETSC_USE_COMPLEX)
1065         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1066 #else
1067         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]));
1068 #endif
1069       }
1070     }
1071     /*
1072     if (nofinalvalue) {
1073 #if defined(PETSC_USE_COMPLEX)
1074       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e %18.16e\n",m,A->cmap->n,0.,0.));
1075 #else
1076       PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT "  %18.16e\n",m,A->cmap->n,0.0));
1077 #endif
1078     }
1079     */
1080     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1081     PetscCall(PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name));
1082     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1083   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1084     PetscFunctionReturn(0);
1085   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1086     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1087     for (i=0; i<m; i++) {
1088       PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1089       shift = a->sliidx[i>>3]+(i&0x07);
1090       for (j=0; j<a->rlen[i]; j++) {
1091 #if defined(PETSC_USE_COMPLEX)
1092         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1093           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1094         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1095           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1096         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1097           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1098         }
1099 #else
1100         if (a->val[shift+8*j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1101 #endif
1102       }
1103       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1104     }
1105     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1106   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1107     PetscInt    cnt=0,jcnt;
1108     PetscScalar value;
1109 #if defined(PETSC_USE_COMPLEX)
1110     PetscBool   realonly=PETSC_TRUE;
1111     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1112       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1113         realonly = PETSC_FALSE;
1114         break;
1115       }
1116     }
1117 #endif
1118 
1119     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1120     for (i=0; i<m; i++) {
1121       jcnt = 0;
1122       shift = a->sliidx[i>>3]+(i&0x07);
1123       for (j=0; j<A->cmap->n; j++) {
1124         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1125           value = a->val[cnt++];
1126           jcnt++;
1127         } else {
1128           value = 0.0;
1129         }
1130 #if defined(PETSC_USE_COMPLEX)
1131         if (realonly) {
1132           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value)));
1133         } else {
1134           PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value)));
1135         }
1136 #else
1137         PetscCall(PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value));
1138 #endif
1139       }
1140       PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1141     }
1142     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1143   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1144     PetscInt fshift=1;
1145     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1146 #if defined(PETSC_USE_COMPLEX)
1147     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n"));
1148 #else
1149     PetscCall(PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n"));
1150 #endif
1151     PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz));
1152     for (i=0; i<m; i++) {
1153       shift = a->sliidx[i>>3]+(i&0x07);
1154       for (j=0; j<a->rlen[i]; j++) {
1155 #if defined(PETSC_USE_COMPLEX)
1156         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1157 #else
1158         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]));
1159 #endif
1160       }
1161     }
1162     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1163   } else if (format == PETSC_VIEWER_NATIVE) {
1164     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1165       PetscInt row;
1166       PetscCall(PetscViewerASCIIPrintf(viewer,"slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n",i,a->sliidx[i],a->sliidx[i+1]));
1167       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1168 #if defined(PETSC_USE_COMPLEX)
1169         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1170           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1171         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1172           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j])));
1173         } else {
1174           PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j])));
1175         }
1176 #else
1177         PetscCall(PetscViewerASCIIPrintf(viewer,"  %" PetscInt_FMT " %" PetscInt_FMT " %g\n",8*i+row,a->colidx[j],(double)a->val[j]));
1178 #endif
1179       }
1180     }
1181   } else {
1182     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_FALSE));
1183     if (A->factortype) {
1184       for (i=0; i<m; i++) {
1185         shift = a->sliidx[i>>3]+(i&0x07);
1186         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1187         /* L part */
1188         for (j=shift; j<a->diag[i]; j+=8) {
1189 #if defined(PETSC_USE_COMPLEX)
1190           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1191             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1192           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1193             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1194           } else {
1195             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1196           }
1197 #else
1198           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1199 #endif
1200         }
1201         /* diagonal */
1202         j = a->diag[i];
1203 #if defined(PETSC_USE_COMPLEX)
1204         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1205           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j])));
1206         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1207           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j]))));
1208         } else {
1209           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j])));
1210         }
1211 #else
1212         PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)(1.0/a->val[j])));
1213 #endif
1214 
1215         /* U part */
1216         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1217 #if defined(PETSC_USE_COMPLEX)
1218           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1219             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j])));
1220           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1221             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j]))));
1222           } else {
1223             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)PetscRealPart(a->val[j])));
1224           }
1225 #else
1226           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[j],(double)a->val[j]));
1227 #endif
1228         }
1229         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1230       }
1231     } else {
1232       for (i=0; i<m; i++) {
1233         shift = a->sliidx[i>>3]+(i&0x07);
1234         PetscCall(PetscViewerASCIIPrintf(viewer,"row %" PetscInt_FMT ":",i));
1235         for (j=0; j<a->rlen[i]; j++) {
1236 #if defined(PETSC_USE_COMPLEX)
1237           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1238             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j])));
1239           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1240             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j])));
1241           } else {
1242             PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j])));
1243           }
1244 #else
1245           PetscCall(PetscViewerASCIIPrintf(viewer," (%" PetscInt_FMT ", %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]));
1246 #endif
1247         }
1248         PetscCall(PetscViewerASCIIPrintf(viewer,"\n"));
1249       }
1250     }
1251     PetscCall(PetscViewerASCIIUseTabs(viewer,PETSC_TRUE));
1252   }
1253   PetscCall(PetscViewerFlush(viewer));
1254   PetscFunctionReturn(0);
1255 }
1256 
1257 #include <petscdraw.h>
1258 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1259 {
1260   Mat               A=(Mat)Aa;
1261   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1262   PetscInt          i,j,m=A->rmap->n,shift;
1263   int               color;
1264   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1265   PetscViewer       viewer;
1266   PetscViewerFormat format;
1267 
1268   PetscFunctionBegin;
1269   PetscCall(PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer));
1270   PetscCall(PetscViewerGetFormat(viewer,&format));
1271   PetscCall(PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr));
1272 
1273   /* loop over matrix elements drawing boxes */
1274 
1275   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1276     PetscDrawCollectiveBegin(draw);
1277     /* Blue for negative, Cyan for zero and  Red for positive */
1278     color = PETSC_DRAW_BLUE;
1279     for (i=0; i<m; i++) {
1280       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1281       y_l = m - i - 1.0; y_r = y_l + 1.0;
1282       for (j=0; j<a->rlen[i]; j++) {
1283         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1284         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1285         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1286       }
1287     }
1288     color = PETSC_DRAW_CYAN;
1289     for (i=0; i<m; i++) {
1290       shift = a->sliidx[i>>3]+(i&0x07);
1291       y_l = m - i - 1.0; y_r = y_l + 1.0;
1292       for (j=0; j<a->rlen[i]; j++) {
1293         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1294         if (a->val[shift+8*j] !=  0.) continue;
1295         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1296       }
1297     }
1298     color = PETSC_DRAW_RED;
1299     for (i=0; i<m; i++) {
1300       shift = a->sliidx[i>>3]+(i&0x07);
1301       y_l = m - i - 1.0; y_r = y_l + 1.0;
1302       for (j=0; j<a->rlen[i]; j++) {
1303         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1304         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1305         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1306       }
1307     }
1308     PetscDrawCollectiveEnd(draw);
1309   } else {
1310     /* use contour shading to indicate magnitude of values */
1311     /* first determine max of all nonzero values */
1312     PetscReal minv=0.0,maxv=0.0;
1313     PetscInt  count=0;
1314     PetscDraw popup;
1315     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1316       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1317     }
1318     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1319     PetscCall(PetscDrawGetPopup(draw,&popup));
1320     PetscCall(PetscDrawScalePopup(popup,minv,maxv));
1321 
1322     PetscDrawCollectiveBegin(draw);
1323     for (i=0; i<m; i++) {
1324       shift = a->sliidx[i>>3]+(i&0x07);
1325       y_l = m - i - 1.0;
1326       y_r = y_l + 1.0;
1327       for (j=0; j<a->rlen[i]; j++) {
1328         x_l = a->colidx[shift+j*8];
1329         x_r = x_l + 1.0;
1330         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1331         PetscCall(PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color));
1332         count++;
1333       }
1334     }
1335     PetscDrawCollectiveEnd(draw);
1336   }
1337   PetscFunctionReturn(0);
1338 }
1339 
1340 #include <petscdraw.h>
1341 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1342 {
1343   PetscDraw      draw;
1344   PetscReal      xr,yr,xl,yl,h,w;
1345   PetscBool      isnull;
1346 
1347   PetscFunctionBegin;
1348   PetscCall(PetscViewerDrawGetDraw(viewer,0,&draw));
1349   PetscCall(PetscDrawIsNull(draw,&isnull));
1350   if (isnull) PetscFunctionReturn(0);
1351 
1352   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1353   xr  += w;          yr += h;         xl = -w;     yl = -h;
1354   PetscCall(PetscDrawSetCoordinates(draw,xl,yl,xr,yr));
1355   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer));
1356   PetscCall(PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A));
1357   PetscCall(PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL));
1358   PetscCall(PetscDrawSave(draw));
1359   PetscFunctionReturn(0);
1360 }
1361 
1362 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1363 {
1364   PetscBool      iascii,isbinary,isdraw;
1365 
1366   PetscFunctionBegin;
1367   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii));
1368   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary));
1369   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw));
1370   if (iascii) {
1371     PetscCall(MatView_SeqSELL_ASCII(A,viewer));
1372   } else if (isbinary) {
1373     /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */
1374   } else if (isdraw) {
1375     PetscCall(MatView_SeqSELL_Draw(A,viewer));
1376   }
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1381 {
1382   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1383   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1384   MatScalar      *vp;
1385 
1386   PetscFunctionBegin;
1387   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1388   /* To do: compress out the unused elements */
1389   PetscCall(MatMarkDiagonal_SeqSELL(A));
1390   PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz));
1391   PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n",a->reallocs));
1392   PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rlenmax));
1393   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1394   for (i=0; i<a->totalslices; ++i) {
1395     shift = a->sliidx[i];    /* starting index of the slice */
1396     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1397     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1398     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1399       row  = 8*i + row_in_slice;
1400       nrow = a->rlen[row]; /* number of nonzeros in row */
1401       /*
1402         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1403         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1404       */
1405       lastcol = 0;
1406       if (nrow>0) { /* nonempty row */
1407         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1408       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1409         for (j=1;j<8;j++) {
1410           if (a->rlen[8*i+j]) {
1411             lastcol = cp[j];
1412             break;
1413           }
1414         }
1415       } else {
1416         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1417       }
1418 
1419       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1420         cp[8*k+row_in_slice] = lastcol;
1421         vp[8*k+row_in_slice] = (MatScalar)0;
1422       }
1423     }
1424   }
1425 
1426   A->info.mallocs += a->reallocs;
1427   a->reallocs      = 0;
1428 
1429   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1430   PetscFunctionReturn(0);
1431 }
1432 
1433 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1434 {
1435   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1436 
1437   PetscFunctionBegin;
1438   info->block_size   = 1.0;
1439   info->nz_allocated = a->maxallocmat;
1440   info->nz_used      = a->sliidx[a->totalslices]; /* include padding zeros */
1441   info->nz_unneeded  = (a->maxallocmat-a->sliidx[a->totalslices]);
1442   info->assemblies   = A->num_ass;
1443   info->mallocs      = A->info.mallocs;
1444   info->memory       = ((PetscObject)A)->mem;
1445   if (A->factortype) {
1446     info->fill_ratio_given  = A->info.fill_ratio_given;
1447     info->fill_ratio_needed = A->info.fill_ratio_needed;
1448     info->factor_mallocs    = A->info.factor_mallocs;
1449   } else {
1450     info->fill_ratio_given  = 0;
1451     info->fill_ratio_needed = 0;
1452     info->factor_mallocs    = 0;
1453   }
1454   PetscFunctionReturn(0);
1455 }
1456 
1457 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1458 {
1459   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1460   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1461   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1462   MatScalar      *vp,value;
1463 
1464   PetscFunctionBegin;
1465   for (k=0; k<m; k++) { /* loop over added rows */
1466     row = im[k];
1467     if (row < 0) continue;
1468     PetscCheck(row < A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT,row,A->rmap->n-1);
1469     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1470     cp    = a->colidx+shift; /* pointer to the row */
1471     vp    = a->val+shift; /* pointer to the row */
1472     nrow  = a->rlen[row];
1473     low   = 0;
1474     high  = nrow;
1475 
1476     for (l=0; l<n; l++) { /* loop over added columns */
1477       col = in[l];
1478       if (col<0) continue;
1479       PetscCheck(col < A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT,col,A->cmap->n-1);
1480       if (a->roworiented) {
1481         value = v[l+k*n];
1482       } else {
1483         value = v[k+l*m];
1484       }
1485       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1486 
1487       /* search in this row for the specified column, i indicates the column to be set */
1488       if (col <= lastcol) low = 0;
1489       else high = nrow;
1490       lastcol = col;
1491       while (high-low > 5) {
1492         t = (low+high)/2;
1493         if (*(cp+t*8) > col) high = t;
1494         else low = t;
1495       }
1496       for (i=low; i<high; i++) {
1497         if (*(cp+i*8) > col) break;
1498         if (*(cp+i*8) == col) {
1499           if (is == ADD_VALUES) *(vp+i*8) += value;
1500           else *(vp+i*8) = value;
1501           low = i + 1;
1502           goto noinsert;
1503         }
1504       }
1505       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1506       if (nonew == 1) goto noinsert;
1507       PetscCheck(nonew != -1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col);
1508       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1509       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1510       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1511       for (ii=nrow-1; ii>=i; ii--) {
1512         *(cp+(ii+1)*8) = *(cp+ii*8);
1513         *(vp+(ii+1)*8) = *(vp+ii*8);
1514       }
1515       a->rlen[row]++;
1516       *(cp+i*8) = col;
1517       *(vp+i*8) = value;
1518       a->nz++;
1519       A->nonzerostate++;
1520       low = i+1; high++; nrow++;
1521 noinsert:;
1522     }
1523     a->rlen[row] = nrow;
1524   }
1525   PetscFunctionReturn(0);
1526 }
1527 
1528 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1529 {
1530   PetscFunctionBegin;
1531   /* If the two matrices have the same copy implementation, use fast copy. */
1532   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1533     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1534     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1535 
1536     PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices],PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1537     PetscCall(PetscArraycpy(b->val,a->val,a->sliidx[a->totalslices]));
1538   } else {
1539     PetscCall(MatCopy_Basic(A,B,str));
1540   }
1541   PetscFunctionReturn(0);
1542 }
1543 
1544 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1545 {
1546   PetscFunctionBegin;
1547   PetscCall(MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,NULL));
1548   PetscFunctionReturn(0);
1549 }
1550 
1551 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1552 {
1553   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1554 
1555   PetscFunctionBegin;
1556   *array = a->val;
1557   PetscFunctionReturn(0);
1558 }
1559 
1560 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1561 {
1562   PetscFunctionBegin;
1563   PetscFunctionReturn(0);
1564 }
1565 
1566 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1567 {
1568   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1569   PetscInt    i;
1570   MatScalar   *aval=a->val;
1571 
1572   PetscFunctionBegin;
1573   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1574   PetscFunctionReturn(0);
1575 }
1576 
1577 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1578 {
1579   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1580   PetscInt       i;
1581   MatScalar      *aval=a->val;
1582 
1583   PetscFunctionBegin;
1584   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1585   PetscCall(MatSeqSELLInvalidateDiagonal(A));
1586   PetscFunctionReturn(0);
1587 }
1588 
1589 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1590 {
1591   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1592   MatScalar      *aval=a->val;
1593   PetscScalar    oalpha=alpha;
1594   PetscBLASInt   one=1,size;
1595 
1596   PetscFunctionBegin;
1597   PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices],&size));
1598   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1599   PetscCall(PetscLogFlops(a->nz));
1600   PetscCall(MatSeqSELLInvalidateDiagonal(inA));
1601   PetscFunctionReturn(0);
1602 }
1603 
1604 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1605 {
1606   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1607 
1608   PetscFunctionBegin;
1609   if (!Y->preallocated || !y->nz) {
1610     PetscCall(MatSeqSELLSetPreallocation(Y,1,NULL));
1611   }
1612   PetscCall(MatShift_Basic(Y,a));
1613   PetscFunctionReturn(0);
1614 }
1615 
1616 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1617 {
1618   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1619   PetscScalar       *x,sum,*t;
1620   const MatScalar   *idiag=NULL,*mdiag;
1621   const PetscScalar *b,*xb;
1622   PetscInt          n,m=A->rmap->n,i,j,shift;
1623   const PetscInt    *diag;
1624 
1625   PetscFunctionBegin;
1626   its = its*lits;
1627 
1628   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1629   if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A,omega,fshift));
1630   a->fshift = fshift;
1631   a->omega  = omega;
1632 
1633   diag  = a->diag;
1634   t     = a->ssor_work;
1635   idiag = a->idiag;
1636   mdiag = a->mdiag;
1637 
1638   PetscCall(VecGetArray(xx,&x));
1639   PetscCall(VecGetArrayRead(bb,&b));
1640   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1641   PetscCheck(flag != SOR_APPLY_UPPER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1642   PetscCheck(flag != SOR_APPLY_LOWER,PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1643   PetscCheckFalse(flag & SOR_EISENSTAT,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1644 
1645   if (flag & SOR_ZERO_INITIAL_GUESS) {
1646     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1647       for (i=0; i<m; i++) {
1648         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1649         sum   = b[i];
1650         n     = (diag[i]-shift)/8;
1651         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1652         t[i]  = sum;
1653         x[i]  = sum*idiag[i];
1654       }
1655       xb   = t;
1656       PetscCall(PetscLogFlops(a->nz));
1657     } else xb = b;
1658     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1659       for (i=m-1; i>=0; i--) {
1660         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1661         sum   = xb[i];
1662         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1663         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1664         if (xb == b) {
1665           x[i] = sum*idiag[i];
1666         } else {
1667           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1668         }
1669       }
1670       PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1671     }
1672     its--;
1673   }
1674   while (its--) {
1675     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1676       for (i=0; i<m; i++) {
1677         /* lower */
1678         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1679         sum   = b[i];
1680         n     = (diag[i]-shift)/8;
1681         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1682         t[i]  = sum;             /* save application of the lower-triangular part */
1683         /* upper */
1684         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1685         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1686         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1687       }
1688       xb   = t;
1689       PetscCall(PetscLogFlops(2.0*a->nz));
1690     } else xb = b;
1691     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1692       for (i=m-1; i>=0; i--) {
1693         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1694         sum = xb[i];
1695         if (xb == b) {
1696           /* whole matrix (no checkpointing available) */
1697           n     = a->rlen[i];
1698           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1699           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1700         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1701           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1702           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1703           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1704         }
1705       }
1706       if (xb == b) {
1707         PetscCall(PetscLogFlops(2.0*a->nz));
1708       } else {
1709         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */
1710       }
1711     }
1712   }
1713   PetscCall(VecRestoreArray(xx,&x));
1714   PetscCall(VecRestoreArrayRead(bb,&b));
1715   PetscFunctionReturn(0);
1716 }
1717 
1718 /* -------------------------------------------------------------------*/
1719 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1720                                        MatGetRow_SeqSELL,
1721                                        MatRestoreRow_SeqSELL,
1722                                        MatMult_SeqSELL,
1723                                /* 4*/  MatMultAdd_SeqSELL,
1724                                        MatMultTranspose_SeqSELL,
1725                                        MatMultTransposeAdd_SeqSELL,
1726                                        NULL,
1727                                        NULL,
1728                                        NULL,
1729                                /* 10*/ NULL,
1730                                        NULL,
1731                                        NULL,
1732                                        MatSOR_SeqSELL,
1733                                        NULL,
1734                                /* 15*/ MatGetInfo_SeqSELL,
1735                                        MatEqual_SeqSELL,
1736                                        MatGetDiagonal_SeqSELL,
1737                                        MatDiagonalScale_SeqSELL,
1738                                        NULL,
1739                                /* 20*/ NULL,
1740                                        MatAssemblyEnd_SeqSELL,
1741                                        MatSetOption_SeqSELL,
1742                                        MatZeroEntries_SeqSELL,
1743                                /* 24*/ NULL,
1744                                        NULL,
1745                                        NULL,
1746                                        NULL,
1747                                        NULL,
1748                                /* 29*/ MatSetUp_SeqSELL,
1749                                        NULL,
1750                                        NULL,
1751                                        NULL,
1752                                        NULL,
1753                                /* 34*/ MatDuplicate_SeqSELL,
1754                                        NULL,
1755                                        NULL,
1756                                        NULL,
1757                                        NULL,
1758                                /* 39*/ NULL,
1759                                        NULL,
1760                                        NULL,
1761                                        MatGetValues_SeqSELL,
1762                                        MatCopy_SeqSELL,
1763                                /* 44*/ NULL,
1764                                        MatScale_SeqSELL,
1765                                        MatShift_SeqSELL,
1766                                        NULL,
1767                                        NULL,
1768                                /* 49*/ NULL,
1769                                        NULL,
1770                                        NULL,
1771                                        NULL,
1772                                        NULL,
1773                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1774                                        NULL,
1775                                        NULL,
1776                                        NULL,
1777                                        NULL,
1778                                /* 59*/ NULL,
1779                                        MatDestroy_SeqSELL,
1780                                        MatView_SeqSELL,
1781                                        NULL,
1782                                        NULL,
1783                                /* 64*/ NULL,
1784                                        NULL,
1785                                        NULL,
1786                                        NULL,
1787                                        NULL,
1788                                /* 69*/ NULL,
1789                                        NULL,
1790                                        NULL,
1791                                        NULL,
1792                                        NULL,
1793                                /* 74*/ NULL,
1794                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1795                                        NULL,
1796                                        NULL,
1797                                        NULL,
1798                                /* 79*/ NULL,
1799                                        NULL,
1800                                        NULL,
1801                                        NULL,
1802                                        NULL,
1803                                /* 84*/ NULL,
1804                                        NULL,
1805                                        NULL,
1806                                        NULL,
1807                                        NULL,
1808                                /* 89*/ NULL,
1809                                        NULL,
1810                                        NULL,
1811                                        NULL,
1812                                        NULL,
1813                                /* 94*/ NULL,
1814                                        NULL,
1815                                        NULL,
1816                                        NULL,
1817                                        NULL,
1818                                /* 99*/ NULL,
1819                                        NULL,
1820                                        NULL,
1821                                        MatConjugate_SeqSELL,
1822                                        NULL,
1823                                /*104*/ NULL,
1824                                        NULL,
1825                                        NULL,
1826                                        NULL,
1827                                        NULL,
1828                                /*109*/ NULL,
1829                                        NULL,
1830                                        NULL,
1831                                        NULL,
1832                                        MatMissingDiagonal_SeqSELL,
1833                                /*114*/ NULL,
1834                                        NULL,
1835                                        NULL,
1836                                        NULL,
1837                                        NULL,
1838                                /*119*/ NULL,
1839                                        NULL,
1840                                        NULL,
1841                                        NULL,
1842                                        NULL,
1843                                /*124*/ NULL,
1844                                        NULL,
1845                                        NULL,
1846                                        NULL,
1847                                        NULL,
1848                                /*129*/ NULL,
1849                                        NULL,
1850                                        NULL,
1851                                        NULL,
1852                                        NULL,
1853                                /*134*/ NULL,
1854                                        NULL,
1855                                        NULL,
1856                                        NULL,
1857                                        NULL,
1858                                /*139*/ NULL,
1859                                        NULL,
1860                                        NULL,
1861                                        MatFDColoringSetUp_SeqXAIJ,
1862                                        NULL,
1863                                /*144*/ NULL,
1864                                        NULL,
1865                                        NULL,
1866                                        NULL
1867 };
1868 
1869 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1870 {
1871   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1872 
1873   PetscFunctionBegin;
1874   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1875 
1876   /* allocate space for values if not already there */
1877   if (!a->saved_values) {
1878     PetscCall(PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values));
1879     PetscCall(PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar)));
1880   }
1881 
1882   /* copy values over */
1883   PetscCall(PetscArraycpy(a->saved_values,a->val,a->sliidx[a->totalslices]));
1884   PetscFunctionReturn(0);
1885 }
1886 
1887 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1888 {
1889   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1890 
1891   PetscFunctionBegin;
1892   PetscCheck(a->nonew,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1893   PetscCheck(a->saved_values,PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1894   PetscCall(PetscArraycpy(a->val,a->saved_values,a->sliidx[a->totalslices]));
1895   PetscFunctionReturn(0);
1896 }
1897 
1898 /*@C
1899  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1900 
1901  Not Collective
1902 
1903  Input Parameters:
1904  .  mat - a MATSEQSELL matrix
1905  .  array - pointer to the data
1906 
1907  Level: intermediate
1908 
1909  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1910  @*/
1911 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1912 {
1913   PetscFunctionBegin;
1914   PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));
1915   PetscFunctionReturn(0);
1916 }
1917 
1918 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1919 {
1920   Mat_SeqSELL    *b;
1921   PetscMPIInt    size;
1922 
1923   PetscFunctionBegin;
1924   PetscCall(PetscCitationsRegister(citation,&cited));
1925   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
1926   PetscCheck(size <= 1,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1927 
1928   PetscCall(PetscNewLog(B,&b));
1929 
1930   B->data = (void*)b;
1931 
1932   PetscCall(PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps)));
1933 
1934   b->row                = NULL;
1935   b->col                = NULL;
1936   b->icol               = NULL;
1937   b->reallocs           = 0;
1938   b->ignorezeroentries  = PETSC_FALSE;
1939   b->roworiented        = PETSC_TRUE;
1940   b->nonew              = 0;
1941   b->diag               = NULL;
1942   b->solve_work         = NULL;
1943   B->spptr              = NULL;
1944   b->saved_values       = NULL;
1945   b->idiag              = NULL;
1946   b->mdiag              = NULL;
1947   b->ssor_work          = NULL;
1948   b->omega              = 1.0;
1949   b->fshift             = 0.0;
1950   b->idiagvalid         = PETSC_FALSE;
1951   b->keepnonzeropattern = PETSC_FALSE;
1952 
1953   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL));
1954   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL));
1955   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL));
1956   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL));
1957   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL));
1958   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL));
1959   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ));
1960   PetscFunctionReturn(0);
1961 }
1962 
1963 /*
1964  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1965  */
1966 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1967 {
1968   Mat_SeqSELL    *c = (Mat_SeqSELL*)C->data,*a = (Mat_SeqSELL*)A->data;
1969   PetscInt       i,m=A->rmap->n;
1970   PetscInt       totalslices=a->totalslices;
1971 
1972   PetscFunctionBegin;
1973   C->factortype = A->factortype;
1974   c->row        = NULL;
1975   c->col        = NULL;
1976   c->icol       = NULL;
1977   c->reallocs   = 0;
1978   C->assembled = PETSC_TRUE;
1979 
1980   PetscCall(PetscLayoutReference(A->rmap,&C->rmap));
1981   PetscCall(PetscLayoutReference(A->cmap,&C->cmap));
1982 
1983   PetscCall(PetscMalloc1(8*totalslices,&c->rlen));
1984   PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
1985   PetscCall(PetscMalloc1(totalslices+1,&c->sliidx));
1986   PetscCall(PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt)));
1987 
1988   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
1989   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
1990 
1991   /* allocate the matrix space */
1992   if (mallocmatspace) {
1993     PetscCall(PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx));
1994     PetscCall(PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt))));
1995 
1996     c->singlemalloc = PETSC_TRUE;
1997 
1998     if (m > 0) {
1999       PetscCall(PetscArraycpy(c->colidx,a->colidx,a->maxallocmat));
2000       if (cpvalues == MAT_COPY_VALUES) {
2001         PetscCall(PetscArraycpy(c->val,a->val,a->maxallocmat));
2002       } else {
2003         PetscCall(PetscArrayzero(c->val,a->maxallocmat));
2004       }
2005     }
2006   }
2007 
2008   c->ignorezeroentries = a->ignorezeroentries;
2009   c->roworiented       = a->roworiented;
2010   c->nonew             = a->nonew;
2011   if (a->diag) {
2012     PetscCall(PetscMalloc1(m,&c->diag));
2013     PetscCall(PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt)));
2014     for (i=0; i<m; i++) {
2015       c->diag[i] = a->diag[i];
2016     }
2017   } else c->diag = NULL;
2018 
2019   c->solve_work         = NULL;
2020   c->saved_values       = NULL;
2021   c->idiag              = NULL;
2022   c->ssor_work          = NULL;
2023   c->keepnonzeropattern = a->keepnonzeropattern;
2024   c->free_val           = PETSC_TRUE;
2025   c->free_colidx        = PETSC_TRUE;
2026 
2027   c->maxallocmat  = a->maxallocmat;
2028   c->maxallocrow  = a->maxallocrow;
2029   c->rlenmax      = a->rlenmax;
2030   c->nz           = a->nz;
2031   C->preallocated = PETSC_TRUE;
2032 
2033   c->nonzerorowcnt = a->nonzerorowcnt;
2034   C->nonzerostate  = A->nonzerostate;
2035 
2036   PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist));
2037   PetscFunctionReturn(0);
2038 }
2039 
2040 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2041 {
2042   PetscFunctionBegin;
2043   PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
2044   PetscCall(MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n));
2045   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2046     PetscCall(MatSetBlockSizesFromMats(*B,A,A));
2047   }
2048   PetscCall(MatSetType(*B,((PetscObject)A)->type_name));
2049   PetscCall(MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE));
2050   PetscFunctionReturn(0);
2051 }
2052 
2053 /*MC
2054    MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices,
2055    based on the sliced Ellpack format
2056 
2057    Options Database Keys:
2058 . -mat_type seqsell - sets the matrix type to "seqsell" during a call to MatSetFromOptions()
2059 
2060    Level: beginner
2061 
2062 .seealso: MatCreateSeqSell(), MATSELL, MATMPISELL, MATSEQAIJ, MATAIJ, MATMPIAIJ
2063 M*/
2064 
2065 /*MC
2066    MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices.
2067 
2068    This matrix type is identical to MATSEQSELL when constructed with a single process communicator,
2069    and MATMPISELL otherwise.  As a result, for single process communicators,
2070   MatSeqSELLSetPreallocation() is supported, and similarly MatMPISELLSetPreallocation() is supported
2071   for communicators controlling multiple processes.  It is recommended that you call both of
2072   the above preallocation routines for simplicity.
2073 
2074    Options Database Keys:
2075 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions()
2076 
2077   Level: beginner
2078 
2079   Notes:
2080    This format is only supported for real scalars, double precision, and 32 bit indices (the defaults).
2081 
2082    It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of
2083    non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement.
2084 
2085   Developer Notes:
2086    On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance.
2087 
2088    The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8
2089 .vb
2090                             (2 0  3 4)
2091    Consider the matrix A =  (5 0  6 0)
2092                             (0 0  7 8)
2093                             (0 0  9 9)
2094 
2095    symbolically the Ellpack format can be written as
2096 
2097         (2 3 4 |)           (0 2 3 |)
2098    v =  (5 6 0 |)  colidx = (0 2 2 |)
2099         --------            ---------
2100         (7 8 |)             (2 3 |)
2101         (9 9 |)             (2 3 |)
2102 
2103     The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case).
2104     Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and
2105     zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice.
2106 
2107     The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9)  and for colidx is (0 0 2 2 3 2 2 2 3 3)
2108 
2109 .ve
2110 
2111       See MatMult_SeqSELL() for how this format is used with the SIMD operations to achieve high performance.
2112 
2113  References:
2114 . * - Hong Zhang, Richard T. Mills, Karl Rupp, and Barry F. Smith, Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512},
2115    Proceedings of the 47th International Conference on Parallel Processing, 2018.
2116 
2117 .seealso: MatCreateSeqSELL(), MatCreateSeqAIJ(), MatCreateSell(), MATSEQSELL, MATMPISELL, MATSEQAIJ, MATMPIAIJ, MATAIJ
2118 M*/
2119 
2120 /*@C
2121        MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2122 
2123  Collective on comm
2124 
2125  Input Parameters:
2126 +  comm - MPI communicator, set to PETSC_COMM_SELF
2127 .  m - number of rows
2128 .  n - number of columns
2129 .  rlenmax - maximum number of nonzeros in a row
2130 -  rlen - array containing the number of nonzeros in the various rows
2131  (possibly different for each row) or NULL
2132 
2133  Output Parameter:
2134 .  A - the matrix
2135 
2136  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2137  MatXXXXSetPreallocation() paradigm instead of this routine directly.
2138  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2139 
2140  Notes:
2141  If nnz is given then nz is ignored
2142 
2143  Specify the preallocated storage with either rlenmax or rlen (not both).
2144  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2145  allocation.  For large problems you MUST preallocate memory or you
2146  will get TERRIBLE performance, see the users' manual chapter on matrices.
2147 
2148  Level: intermediate
2149 
2150  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatSeqSELLSetPreallocation(), MATSELL, MATSEQSELL, MATMPISELL
2151 
2152  @*/
2153 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2154 {
2155   PetscFunctionBegin;
2156   PetscCall(MatCreate(comm,A));
2157   PetscCall(MatSetSizes(*A,m,n,m,n));
2158   PetscCall(MatSetType(*A,MATSEQSELL));
2159   PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen));
2160   PetscFunctionReturn(0);
2161 }
2162 
2163 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2164 {
2165   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2166   PetscInt       totalslices=a->totalslices;
2167 
2168   PetscFunctionBegin;
2169   /* If the  matrix dimensions are not equal,or no of nonzeros */
2170   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2171     *flg = PETSC_FALSE;
2172     PetscFunctionReturn(0);
2173   }
2174   /* if the a->colidx are the same */
2175   PetscCall(PetscArraycmp(a->colidx,b->colidx,a->sliidx[totalslices],flg));
2176   if (!*flg) PetscFunctionReturn(0);
2177   /* if a->val are the same */
2178   PetscCall(PetscArraycmp(a->val,b->val,a->sliidx[totalslices],flg));
2179   PetscFunctionReturn(0);
2180 }
2181 
2182 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2183 {
2184   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2185 
2186   PetscFunctionBegin;
2187   a->idiagvalid  = PETSC_FALSE;
2188   PetscFunctionReturn(0);
2189 }
2190 
2191 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2192 {
2193 #if defined(PETSC_USE_COMPLEX)
2194   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2195   PetscInt    i;
2196   PetscScalar *val = a->val;
2197 
2198   PetscFunctionBegin;
2199   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2200     val[i] = PetscConj(val[i]);
2201   }
2202 #else
2203   PetscFunctionBegin;
2204 #endif
2205   PetscFunctionReturn(0);
2206 }
2207