xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 68aafef34f4f37251d20f2f7b9dc5fe912d64cb5)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 #if defined(PETSC_HAVE_IMMINTRIN_H)
9   #include <immintrin.h>
10 
11   #if !defined(_MM_SCALE_8)
12   #define _MM_SCALE_8    8
13   #endif
14 
15   #if defined(__AVX512F__)
16   /* these do not work
17    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
18    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
19   */
20     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
21     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
22     vec_idx  = _mm256_load_si256((__m256i const*)acolidx); \
23     vec_vals = _mm512_load_pd(aval); \
24     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
25     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
26   #elif defined(__AVX2__)
27     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
28     vec_vals = _mm256_load_pd(aval); \
29     vec_idx  =  _mm_load_si128((__m128i const*)acolidx); /* SSE2 */ \
30     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
31     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
32   #endif
33 #endif  /* PETSC_HAVE_IMMINTRIN_H */
34 
35 /*@C
36  MatSeqSELLSetPreallocation - For good matrix assembly performance
37  the user should preallocate the matrix storage by setting the parameter nz
38  (or the array nnz).  By setting these parameters accurately, performance
39  during matrix assembly can be increased significantly.
40 
41  Collective on MPI_Comm
42 
43  Input Parameters:
44  +  B - The matrix
45  .  nz - number of nonzeros per row (same for all rows)
46  -  nnz - array containing the number of nonzeros in the various rows
47  (possibly different for each row) or NULL
48 
49  Notes:
50  If nnz is given then nz is ignored.
51 
52  Specify the preallocated storage with either nz or nnz (not both).
53  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
54  allocation.  For large problems you MUST preallocate memory or you
55  will get TERRIBLE performance, see the users' manual chapter on matrices.
56 
57  You can call MatGetInfo() to get information on how effective the preallocation was;
58  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
59  You can also run with the option -info and look for messages with the string
60  malloc in them to see if additional memory allocation was needed.
61 
62  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
63  entries or columns indices.
64 
65  The maximum number of nonzeos in any row should be as accuate as possible.
66  If it is underesitmated, you will get bad performance due to reallocation
67  (MatSeqXSELLReallocateSELL).
68 
69  Level: intermediate
70 
71  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
72 
73  @*/
74 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
75 {
76   PetscErrorCode ierr;
77 
78   PetscFunctionBegin;
79   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
80   PetscValidType(B,1);
81   ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr);
82   PetscFunctionReturn(0);
83 }
84 
85 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
86 {
87   Mat_SeqSELL    *b;
88   PetscInt       i,j,totalslices;
89   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
90   PetscErrorCode ierr;
91 
92   PetscFunctionBegin;
93   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
94   if (maxallocrow == MAT_SKIP_ALLOCATION) {
95     skipallocation = PETSC_TRUE;
96     maxallocrow    = 0;
97   }
98 
99   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
100   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
101 
102   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
103   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
104   if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow);
105   if (rlen) {
106     for (i=0; i<B->rmap->n; i++) {
107       if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]);
108       if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n);
109     }
110   }
111 
112   B->preallocated = PETSC_TRUE;
113 
114   b = (Mat_SeqSELL*)B->data;
115 
116   totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */
117   b->totalslices = totalslices;
118   if (!skipallocation) {
119     if (B->rmap->n & 0x07) PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);
120 
121     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
122       ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr);
123       ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
124     }
125     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
126       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
127       else if (maxallocrow < 0) maxallocrow = 1;
128       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
129     } else {
130       maxallocrow = 0;
131       b->sliidx[0] = 0;
132       for (i=1; i<totalslices; i++) {
133         b->sliidx[i] = 0;
134         for (j=0;j<8;j++) {
135           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
136         }
137         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
138         b->sliidx[i] = b->sliidx[i-1] + 8*b->sliidx[i];
139       }
140       /* last slice */
141       b->sliidx[totalslices] = 0;
142       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
143       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
144       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
145     }
146 
147     /* allocate space for val, colidx, rlen */
148     /* FIXME: should B's old memory be unlogged? */
149     ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr);
150     /* FIXME: assuming an element of the bit array takes 8 bits */
151     ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr);
152     ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
153     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
154     ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr);
155     ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr);
156 
157     b->singlemalloc = PETSC_TRUE;
158     b->free_val     = PETSC_TRUE;
159     b->free_colidx  = PETSC_TRUE;
160   } else {
161     b->free_val    = PETSC_FALSE;
162     b->free_colidx = PETSC_FALSE;
163   }
164 
165   b->nz               = 0;
166   b->maxallocrow      = maxallocrow;
167   b->rlenmax          = maxallocrow;
168   b->maxallocmat      = b->sliidx[totalslices];
169   B->info.nz_unneeded = (double)b->maxallocmat;
170   if (realalloc) {
171     ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
172   }
173   PetscFunctionReturn(0);
174 }
175 
176 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
177 {
178   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
179   PetscInt    shift;
180 
181   PetscFunctionBegin;
182   if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row);
183   if (nz) *nz = a->rlen[row];
184   shift = a->sliidx[row>>3]+(row&0x07);
185   if (!a->getrowcols) {
186     PetscErrorCode ierr;
187 
188     ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr);
189   }
190   if (idx) {
191     PetscInt j;
192     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
193     *idx = a->getrowcols;
194   }
195   if (v) {
196     PetscInt j;
197     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
198     *v = a->getrowvals;
199   }
200   PetscFunctionReturn(0);
201 }
202 
203 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
204 {
205   PetscFunctionBegin;
206   PetscFunctionReturn(0);
207 }
208 
209 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
210 {
211   Mat            B;
212   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
213   PetscInt       i;
214   PetscErrorCode ierr;
215 
216   PetscFunctionBegin;
217   if (reuse == MAT_REUSE_MATRIX) {
218     B    = *newmat;
219     ierr = MatZeroEntries(B);CHKERRQ(ierr);
220   } else {
221     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
222     ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr);
223     ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr);
224     ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr);
225   }
226 
227   for (i=0; i<A->rmap->n; i++) {
228     PetscInt    nz,*cols;
229     PetscScalar *vals;
230 
231     ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
232     ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
233     ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
234   }
235 
236   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
237   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
238   B->rmap->bs = A->rmap->bs;
239 
240   if (reuse == MAT_INPLACE_MATRIX) {
241     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
242   } else {
243     *newmat = B;
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #include <../src/mat/impls/aij/seq/aij.h>
249 
250 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
251 {
252   Mat               B;
253   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
254   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
255   const PetscInt    *cols;
256   const PetscScalar *vals;
257   PetscErrorCode    ierr;
258 
259   PetscFunctionBegin;
260   if (A->rmap->bs > 1) {
261     ierr = MatConvert_Basic(A,newtype,reuse,newmat);CHKERRQ(ierr);
262     PetscFunctionReturn(0);
263   }
264 
265   if (reuse == MAT_REUSE_MATRIX) {
266     B = *newmat;
267   } else {
268     /* Can we just use ilen? */
269     ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr);
270     for (i=0; i<m; i++) {
271       rowlengths[i] = ai[i+1] - ai[i];
272     }
273 
274     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
275     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
276     ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr);
277     ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr);
278     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
279   }
280 
281   for (row=0; row<m; row++) {
282     ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
283     ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
284     ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
285   }
286   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
287   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
288   B->rmap->bs = A->rmap->bs;
289 
290   if (reuse == MAT_INPLACE_MATRIX) {
291     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
292   } else {
293     *newmat = B;
294   }
295   PetscFunctionReturn(0);
296 }
297 
298 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
299 {
300   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
301   PetscScalar       *y;
302   const PetscScalar *x;
303   const MatScalar   *aval=a->val;
304   PetscInt          totalslices=a->totalslices;
305   const PetscInt    *acolidx=a->colidx;
306   PetscInt          i,j;
307   PetscErrorCode    ierr;
308 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
309   __m512d           vec_x,vec_y,vec_vals;
310   __m256i           vec_idx;
311   __mmask8          mask;
312   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
313   __m256i           vec_idx2,vec_idx3,vec_idx4;
314 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
315   __m128i           vec_idx;
316   __m256d           vec_x,vec_y,vec_y2,vec_vals;
317   MatScalar         yval;
318   PetscInt          r,rows_left,row,nnz_in_row;
319 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
320   __m128d           vec_x_tmp;
321   __m256d           vec_x,vec_y,vec_y2,vec_vals;
322   MatScalar         yval;
323   PetscInt          r,rows_left,row,nnz_in_row;
324 #else
325   PetscScalar       sum[8];
326 #endif
327 
328 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
329 #pragma disjoint(*x,*y,*aval)
330 #endif
331 
332   PetscFunctionBegin;
333   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
334   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
335 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336   for (i=0; i<totalslices; i++) { /* loop over slices */
337     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
338     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
339 
340     vec_y  = _mm512_setzero_pd();
341     vec_y2 = _mm512_setzero_pd();
342     vec_y3 = _mm512_setzero_pd();
343     vec_y4 = _mm512_setzero_pd();
344 
345     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
346     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
347     case 3:
348       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
349       acolidx += 8; aval += 8;
350       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
351       acolidx += 8; aval += 8;
352       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
353       acolidx += 8; aval += 8;
354       j += 3;
355       break;
356     case 2:
357       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
358       acolidx += 8; aval += 8;
359       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
360       acolidx += 8; aval += 8;
361       j += 2;
362       break;
363     case 1:
364       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
365       acolidx += 8; aval += 8;
366       j += 1;
367       break;
368     }
369     #pragma novector
370     for (; j<(a->sliidx[i+1]>>3); j+=4) {
371       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
372       acolidx += 8; aval += 8;
373       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
374       acolidx += 8; aval += 8;
375       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
376       acolidx += 8; aval += 8;
377       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
378       acolidx += 8; aval += 8;
379     }
380 
381     vec_y = _mm512_add_pd(vec_y,vec_y2);
382     vec_y = _mm512_add_pd(vec_y,vec_y3);
383     vec_y = _mm512_add_pd(vec_y,vec_y4);
384     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
385       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
386       _mm512_mask_store_pd(&y[8*i],mask,vec_y);
387     } else {
388       _mm512_store_pd(&y[8*i],vec_y);
389     }
390   }
391 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
392   for (i=0; i<totalslices; i++) { /* loop over full slices */
393     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
394     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
395 
396     /* last slice may have padding rows. Don't use vectorization. */
397     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
398       rows_left = A->rmap->n - 8*i;
399       for (r=0; r<rows_left; ++r) {
400         yval = (MatScalar)0;
401         row = 8*i + r;
402         nnz_in_row = a->rlen[row];
403         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
404         y[row] = yval;
405       }
406       break;
407     }
408 
409     vec_y  = _mm256_setzero_pd();
410     vec_y2 = _mm256_setzero_pd();
411 
412     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
413     #pragma novector
414     #pragma unroll(2)
415     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
416       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
417       aval += 4; acolidx += 4;
418       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
419       aval += 4; acolidx += 4;
420     }
421 
422     _mm256_store_pd(y+i*8,vec_y);
423     _mm256_store_pd(y+i*8+4,vec_y2);
424   }
425 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
426   for (i=0; i<totalslices; i++) { /* loop over full slices */
427     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
428     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
429 
430     vec_y  = _mm256_setzero_pd();
431     vec_y2 = _mm256_setzero_pd();
432 
433     /* last slice may have padding rows. Don't use vectorization. */
434     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
435       rows_left = A->rmap->n - 8*i;
436       for (r=0; r<rows_left; ++r) {
437         yval = (MatScalar)0;
438         row = 8*i + r;
439         nnz_in_row = a->rlen[row];
440         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
441         y[row] = yval;
442       }
443       break;
444     }
445 
446     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447     #pragma novector
448     #pragma unroll(2)
449     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
450       vec_vals  = _mm256_loadu_pd(aval);
451       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
454       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
455       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
456       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
457       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
458       aval     += 4;
459 
460       vec_vals  = _mm256_loadu_pd(aval);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
464       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
465       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
466       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
467       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
468       aval     += 4;
469     }
470 
471     _mm256_storeu_pd(y + i*8,     vec_y);
472     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
473   }
474 #else
475   for (i=0; i<totalslices; i++) { /* loop over slices */
476     for (j=0; j<8; j++) sum[j] = 0.0;
477     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
478       sum[0] += aval[j] * x[acolidx[j]];
479       sum[1] += aval[j+1] * x[acolidx[j+1]];
480       sum[2] += aval[j+2] * x[acolidx[j+2]];
481       sum[3] += aval[j+3] * x[acolidx[j+3]];
482       sum[4] += aval[j+4] * x[acolidx[j+4]];
483       sum[5] += aval[j+5] * x[acolidx[j+5]];
484       sum[6] += aval[j+6] * x[acolidx[j+6]];
485       sum[7] += aval[j+7] * x[acolidx[j+7]];
486     }
487     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
488       for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
489     } else {
490       for(j=0; j<8; j++) y[8*i+j] = sum[j];
491     }
492   }
493 #endif
494 
495   ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */
496   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
497   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
498   PetscFunctionReturn(0);
499 }
500 
501 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
502 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
503 {
504   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
505   PetscScalar       *y,*z;
506   const PetscScalar *x;
507   const MatScalar   *aval=a->val;
508   PetscInt          totalslices=a->totalslices;
509   const PetscInt    *acolidx=a->colidx;
510   PetscInt          i,j;
511   PetscErrorCode    ierr;
512 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
513   __m512d           vec_x,vec_y,vec_vals;
514   __m256i           vec_idx;
515   __mmask8          mask;
516   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
517   __m256i           vec_idx2,vec_idx3,vec_idx4;
518 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
519   __m128d           vec_x_tmp;
520   __m256d           vec_x,vec_y,vec_y2,vec_vals;
521   MatScalar         yval;
522   PetscInt          r,row,nnz_in_row;
523 #else
524   PetscScalar       sum[8];
525 #endif
526 
527 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
528 #pragma disjoint(*x,*y,*aval)
529 #endif
530 
531   PetscFunctionBegin;
532   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
533   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
534 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
535   for (i=0; i<totalslices; i++) { /* loop over slices */
536     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
537     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
538 
539     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
540       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
541       vec_y  = _mm512_mask_load_pd(vec_y,mask,&y[8*i]);
542     } else {
543       vec_y  = _mm512_load_pd(&y[8*i]);
544     }
545     vec_y2 = _mm512_setzero_pd();
546     vec_y3 = _mm512_setzero_pd();
547     vec_y4 = _mm512_setzero_pd();
548 
549     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
550     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
551     case 3:
552       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
553       acolidx += 8; aval += 8;
554       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
555       acolidx += 8; aval += 8;
556       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
557       acolidx += 8; aval += 8;
558       j += 3;
559       break;
560     case 2:
561       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
562       acolidx += 8; aval += 8;
563       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
564       acolidx += 8; aval += 8;
565       j += 2;
566       break;
567     case 1:
568       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
569       acolidx += 8; aval += 8;
570       j += 1;
571       break;
572     }
573     #pragma novector
574     for (; j<(a->sliidx[i+1]>>3); j+=4) {
575       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
576       acolidx += 8; aval += 8;
577       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
578       acolidx += 8; aval += 8;
579       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
580       acolidx += 8; aval += 8;
581       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
582       acolidx += 8; aval += 8;
583     }
584 
585     vec_y = _mm512_add_pd(vec_y,vec_y2);
586     vec_y = _mm512_add_pd(vec_y,vec_y3);
587     vec_y = _mm512_add_pd(vec_y,vec_y4);
588     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
589       _mm512_mask_store_pd(&z[8*i],mask,vec_y);
590     } else {
591       _mm512_store_pd(&z[8*i],vec_y);
592     }
593   }
594 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
595   for (i=0; i<totalslices; i++) { /* loop over full slices */
596     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
597     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
598 
599     /* last slice may have padding rows. Don't use vectorization. */
600     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
601       for (r=0; r<(A->rmap->n & 0x07); ++r) {
602         row        = 8*i + r;
603         yval       = (MatScalar)0.0;
604         nnz_in_row = a->rlen[row];
605         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
606         z[row] = y[row] + yval;
607       }
608       break;
609     }
610 
611     vec_y  = _mm256_loadu_pd(y+8*i);
612     vec_y2 = _mm256_loadu_pd(y+8*i+4);
613 
614     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
615     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
616       vec_vals  = _mm256_loadu_pd(aval);
617       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
618       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
619       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
620       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
621       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
622       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
623       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
624       aval     += 4;
625 
626       vec_vals  = _mm256_loadu_pd(aval);
627       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
628       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
629       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
630       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
631       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
632       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
633       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
634       aval     += 4;
635     }
636 
637     _mm256_storeu_pd(z+i*8,vec_y);
638     _mm256_storeu_pd(z+i*8+4,vec_y2);
639   }
640 #else
641   for (i=0; i<totalslices; i++) { /* loop over slices */
642     for (j=0; j<8; j++) sum[j] = 0.0;
643     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
644       sum[0] += aval[j] * x[acolidx[j]];
645       sum[1] += aval[j+1] * x[acolidx[j+1]];
646       sum[2] += aval[j+2] * x[acolidx[j+2]];
647       sum[3] += aval[j+3] * x[acolidx[j+3]];
648       sum[4] += aval[j+4] * x[acolidx[j+4]];
649       sum[5] += aval[j+5] * x[acolidx[j+5]];
650       sum[6] += aval[j+6] * x[acolidx[j+6]];
651       sum[7] += aval[j+7] * x[acolidx[j+7]];
652     }
653     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
654       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
655     } else {
656       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
657     }
658   }
659 #endif
660 
661   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
662   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
663   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
664   PetscFunctionReturn(0);
665 }
666 
667 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
668 {
669   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
670   PetscScalar       *y;
671   const PetscScalar *x;
672   const MatScalar   *aval=a->val;
673   const PetscInt    *acolidx=a->colidx;
674   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
675   PetscErrorCode    ierr;
676 
677 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
678 #pragma disjoint(*x,*y,*aval)
679 #endif
680 
681   PetscFunctionBegin;
682   if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); }
683   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
684   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
685   for (i=0; i<a->totalslices; i++) { /* loop over slices */
686     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
687       for (r=0; r<(A->rmap->n & 0x07); ++r) {
688         row        = 8*i + r;
689         nnz_in_row = a->rlen[row];
690         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
691       }
692       break;
693     }
694     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
695       y[acolidx[j]]   += aval[j] * x[8*i];
696       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
697       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
698       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
699       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
700       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
701       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
702       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
703     }
704   }
705   ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr);
706   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
707   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
708   PetscFunctionReturn(0);
709 }
710 
711 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
712 {
713   PetscErrorCode ierr;
714 
715   PetscFunctionBegin;
716   ierr = VecSet(yy,0.0);CHKERRQ(ierr);
717   ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr);
718   PetscFunctionReturn(0);
719 }
720 
721 /*
722      Checks for missing diagonals
723 */
724 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
725 {
726   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
727   PetscInt    *diag,i;
728 
729   PetscFunctionBegin;
730   *missing = PETSC_FALSE;
731   if (A->rmap->n > 0 && !(a->colidx)) {
732     *missing = PETSC_TRUE;
733     if (d) *d = 0;
734     PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");
735   } else {
736     diag = a->diag;
737     for (i=0; i<A->rmap->n; i++) {
738       if (diag[i] == -1) {
739         *missing = PETSC_TRUE;
740         if (d) *d = i;
741         PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);
742         break;
743       }
744     }
745   }
746   PetscFunctionReturn(0);
747 }
748 
749 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
750 {
751   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
752   PetscInt       i,j,m=A->rmap->n,shift;
753   PetscErrorCode ierr;
754 
755   PetscFunctionBegin;
756   if (!a->diag) {
757     ierr         = PetscMalloc1(m,&a->diag);CHKERRQ(ierr);
758     ierr         = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr);
759     a->free_diag = PETSC_TRUE;
760   }
761   for (i=0; i<m; i++) { /* loop over rows */
762     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
763     a->diag[i] = -1;
764     for (j=0; j<a->rlen[i]; j++) {
765       if (a->colidx[shift+j*8] == i) {
766         a->diag[i] = shift+j*8;
767         break;
768       }
769     }
770   }
771   PetscFunctionReturn(0);
772 }
773 
774 /*
775   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
776 */
777 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
778 {
779   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
780   PetscInt       i,*diag,m = A->rmap->n;
781   MatScalar      *val = a->val;
782   PetscScalar    *idiag,*mdiag;
783   PetscErrorCode ierr;
784 
785   PetscFunctionBegin;
786   if (a->idiagvalid) PetscFunctionReturn(0);
787   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
788   diag = a->diag;
789   if (!a->idiag) {
790     ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr);
791     ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr);
792     val  = a->val;
793   }
794   mdiag = a->mdiag;
795   idiag = a->idiag;
796 
797   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
798     for (i=0; i<m; i++) {
799       mdiag[i] = val[diag[i]];
800       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
801         if (PetscRealPart(fshift)) {
802           ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr);
803           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
804           A->factorerror_zeropivot_value = 0.0;
805           A->factorerror_zeropivot_row   = i;
806         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i);
807       }
808       idiag[i] = 1.0/val[diag[i]];
809     }
810     ierr = PetscLogFlops(m);CHKERRQ(ierr);
811   } else {
812     for (i=0; i<m; i++) {
813       mdiag[i] = val[diag[i]];
814       idiag[i] = omega/(fshift + val[diag[i]]);
815     }
816     ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
817   }
818   a->idiagvalid = PETSC_TRUE;
819   PetscFunctionReturn(0);
820 }
821 
822 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
823 {
824   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
825   PetscErrorCode ierr;
826 
827   PetscFunctionBegin;
828   ierr = PetscMemzero(a->val,(a->sliidx[a->totalslices])*sizeof(PetscScalar));CHKERRQ(ierr);
829   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
830   PetscFunctionReturn(0);
831 }
832 
833 PetscErrorCode MatDestroy_SeqSELL(Mat A)
834 {
835   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
836   PetscErrorCode ierr;
837 
838   PetscFunctionBegin;
839 #if defined(PETSC_USE_LOG)
840   PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz);
841 #endif
842   ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr);
843   ierr = ISDestroy(&a->row);CHKERRQ(ierr);
844   ierr = ISDestroy(&a->col);CHKERRQ(ierr);
845   ierr = PetscFree(a->diag);CHKERRQ(ierr);
846   ierr = PetscFree(a->ibdiag);CHKERRQ(ierr);
847   ierr = PetscFree(a->rlen);CHKERRQ(ierr);
848   ierr = PetscFree(a->sliidx);CHKERRQ(ierr);
849   ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr);
850   ierr = PetscFree(a->solve_work);CHKERRQ(ierr);
851   ierr = ISDestroy(&a->icol);CHKERRQ(ierr);
852   ierr = PetscFree(a->saved_values);CHKERRQ(ierr);
853   ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr);
854 
855   ierr = PetscFree(A->data);CHKERRQ(ierr);
856 
857   ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr);
858   ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr);
859   ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr);
860 #if defined(PETSC_HAVE_ELEMENTAL)
861 #endif
862   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr);
863   PetscFunctionReturn(0);
864 }
865 
866 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
867 {
868   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
869   PetscErrorCode ierr;
870 
871   PetscFunctionBegin;
872   switch (op) {
873   case MAT_ROW_ORIENTED:
874     a->roworiented = flg;
875     break;
876   case MAT_KEEP_NONZERO_PATTERN:
877     a->keepnonzeropattern = flg;
878     break;
879   case MAT_NEW_NONZERO_LOCATIONS:
880     a->nonew = (flg ? 0 : 1);
881     break;
882   case MAT_NEW_NONZERO_LOCATION_ERR:
883     a->nonew = (flg ? -1 : 0);
884     break;
885   case MAT_NEW_NONZERO_ALLOCATION_ERR:
886     a->nonew = (flg ? -2 : 0);
887     break;
888   case MAT_UNUSED_NONZERO_LOCATION_ERR:
889     a->nounused = (flg ? -1 : 0);
890     break;
891   case MAT_NEW_DIAGONALS:
892   case MAT_IGNORE_OFF_PROC_ENTRIES:
893   case MAT_USE_HASH_TABLE:
894     ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr);
895     break;
896   case MAT_SPD:
897   case MAT_SYMMETRIC:
898   case MAT_STRUCTURALLY_SYMMETRIC:
899   case MAT_HERMITIAN:
900   case MAT_SYMMETRY_ETERNAL:
901     /* These options are handled directly by MatSetOption() */
902     break;
903   default:
904     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
905   }
906   PetscFunctionReturn(0);
907 }
908 
909 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
910 {
911   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
912   PetscInt       i,j,n,shift;
913   PetscScalar    *x,zero=0.0;
914   PetscErrorCode ierr;
915 
916   PetscFunctionBegin;
917   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
918   if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
919 
920   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
921     PetscInt *diag=a->diag;
922     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
923     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
924     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
925     PetscFunctionReturn(0);
926   }
927 
928   ierr = VecSet(v,zero);CHKERRQ(ierr);
929   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
930   for (i=0; i<n; i++) { /* loop over rows */
931     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
932     x[i] = 0;
933     for (j=0; j<a->rlen[i]; j++) {
934       if (a->colidx[shift+j*8] == i) {
935         x[i] = a->val[shift+j*8];
936         break;
937       }
938     }
939   }
940   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
941   PetscFunctionReturn(0);
942 }
943 
944 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
945 {
946   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
947   const PetscScalar *l,*r;
948   PetscInt          i,j,m,n,row;
949   PetscErrorCode    ierr;
950 
951   PetscFunctionBegin;
952   if (ll) {
953     /* The local size is used so that VecMPI can be passed to this routine
954        by MatDiagonalScale_MPISELL */
955     ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr);
956     if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
957     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
958     for (i=0; i<a->totalslices; i++) { /* loop over slices */
959       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
960         a->val[j] *= l[8*i+row];
961       }
962     }
963     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
964     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
965   }
966   if (rr) {
967     ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr);
968     if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
969     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
970     for (i=0; i<a->totalslices; i++) { /* loop over slices */
971       for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
972         a->val[j] *= r[a->colidx[j]];
973       }
974     }
975     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
976     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
977   }
978   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
979   PetscFunctionReturn(0);
980 }
981 
982 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode);
983 
984 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
985 {
986   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
987   PetscInt    *cp,i,k,low,high,t,row,col,l;
988   PetscInt    shift;
989   MatScalar   *vp;
990 
991   PetscFunctionBegin;
992   for (k=0; k<m; k++) { /* loop over requested rows */
993     row = im[k];
994     if (row<0) continue;
995 #if defined(PETSC_USE_DEBUG)
996     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
997 #endif
998     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
999     cp = a->colidx+shift; /* pointer to the row */
1000     vp = a->val+shift; /* pointer to the row */
1001     for (l=0; l<n; l++) { /* loop over requested columns */
1002       col = in[l];
1003       if (col<0) continue;
1004 #if defined(PETSC_USE_DEBUG)
1005       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1);
1006 #endif
1007       high = a->rlen[row]; low = 0; /* assume unsorted */
1008       while (high-low > 5) {
1009         t = (low+high)/2;
1010         if (*(cp+t*8) > col) high = t;
1011         else low = t;
1012       }
1013       for (i=low; i<high; i++) {
1014         if (*(cp+8*i) > col) break;
1015         if (*(cp+8*i) == col) {
1016           *v++ = *(vp+8*i);
1017           goto finished;
1018         }
1019       }
1020       *v++ = 0.0;
1021     finished:;
1022     }
1023   }
1024   PetscFunctionReturn(0);
1025 }
1026 
1027 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1028 {
1029   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1030   PetscInt          i,j,m=A->rmap->n,shift;
1031   const char        *name;
1032   PetscViewerFormat format;
1033   PetscErrorCode    ierr;
1034 
1035   PetscFunctionBegin;
1036   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1037   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1038     PetscInt nofinalvalue = 0;
1039     /*
1040     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1041       nofinalvalue = 1;
1042     }
1043     */
1044     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1045     ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr);
1046     ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr);
1047 #if defined(PETSC_USE_COMPLEX)
1048     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1049 #else
1050     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1051 #endif
1052     ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr);
1053 
1054     for (i=0; i<m; i++) {
1055       shift = a->sliidx[i>>3]+(i&0x07);
1056       for (j=0; j<a->rlen[i]; j++) {
1057 #if defined(PETSC_USE_COMPLEX)
1058         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1059 #else
1060         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1061 #endif
1062       }
1063     }
1064     /*
1065     if (nofinalvalue) {
1066 #if defined(PETSC_USE_COMPLEX)
1067       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr);
1068 #else
1069       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr);
1070 #endif
1071     }
1072     */
1073     ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr);
1074     ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr);
1075     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1076   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1077     PetscFunctionReturn(0);
1078   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1079     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1080     for (i=0; i<m; i++) {
1081       ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1082       shift = a->sliidx[i>>3]+(i&0x07);
1083       for (j=0; j<a->rlen[i]; j++) {
1084 #if defined(PETSC_USE_COMPLEX)
1085         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1086           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1087         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1088           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1089         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1090           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1091         }
1092 #else
1093         if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);}
1094 #endif
1095       }
1096       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1097     }
1098     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1099   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1100     PetscInt    cnt=0,jcnt;
1101     PetscScalar value;
1102 #if defined(PETSC_USE_COMPLEX)
1103     PetscBool   realonly=PETSC_TRUE;
1104     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1105       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1106         realonly = PETSC_FALSE;
1107         break;
1108       }
1109     }
1110 #endif
1111 
1112     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1113     for (i=0; i<m; i++) {
1114       jcnt = 0;
1115       shift = a->sliidx[i>>3]+(i&0x07);
1116       for (j=0; j<A->cmap->n; j++) {
1117         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1118           value = a->val[cnt++];
1119           jcnt++;
1120         } else {
1121           value = 0.0;
1122         }
1123 #if defined(PETSC_USE_COMPLEX)
1124         if (realonly) {
1125           ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr);
1126         } else {
1127           ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr);
1128         }
1129 #else
1130         ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr);
1131 #endif
1132       }
1133       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1134     }
1135     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1136   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1137     PetscInt fshift=1;
1138     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1139 #if defined(PETSC_USE_COMPLEX)
1140     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr);
1141 #else
1142     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr);
1143 #endif
1144     ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr);
1145     for (i=0; i<m; i++) {
1146       shift = a->sliidx[i>>3]+(i&0x07);
1147       for (j=0; j<a->rlen[i]; j++) {
1148 #if defined(PETSC_USE_COMPLEX)
1149         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1150 #else
1151         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1152 #endif
1153       }
1154     }
1155     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1156   } else if (format == PETSC_VIEWER_NATIVE) {
1157     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1158       PetscInt row;
1159       ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr);
1160       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1161 #if defined(PETSC_USE_COMPLEX)
1162         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1163           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1164         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1165           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1166         } else {
1167           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1168         }
1169 #else
1170         ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1171 #endif
1172       }
1173     }
1174   } else {
1175     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1176     if (A->factortype) {
1177       for (i=0; i<m; i++) {
1178         shift = a->sliidx[i>>3]+(i&0x07);
1179         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1180         /* L part */
1181         for (j=shift; j<a->diag[i]; j+=8) {
1182 #if defined(PETSC_USE_COMPLEX)
1183           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1184             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1185           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1186             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1187           } else {
1188             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1189           }
1190 #else
1191           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1192 #endif
1193         }
1194         /* diagonal */
1195         j = a->diag[i];
1196 #if defined(PETSC_USE_COMPLEX)
1197         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1198           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr);
1199         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1200           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr);
1201         } else {
1202           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr);
1203         }
1204 #else
1205         ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr);
1206 #endif
1207 
1208         /* U part */
1209         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1210 #if defined(PETSC_USE_COMPLEX)
1211           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1212             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1213           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1214             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1215           } else {
1216             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1217           }
1218 #else
1219           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1220 #endif
1221         }
1222         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1223       }
1224     } else {
1225       for (i=0; i<m; i++) {
1226         shift = a->sliidx[i>>3]+(i&0x07);
1227         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1228         for (j=0; j<a->rlen[i]; j++) {
1229 #if defined(PETSC_USE_COMPLEX)
1230           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1231             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1232           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1233             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1234           } else {
1235             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1236           }
1237 #else
1238           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);
1239 #endif
1240         }
1241         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1242       }
1243     }
1244     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1245   }
1246   ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
1247   PetscFunctionReturn(0);
1248 }
1249 
1250 #include <petscdraw.h>
1251 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1252 {
1253   Mat               A=(Mat)Aa;
1254   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1255   PetscInt          i,j,m=A->rmap->n,shift;
1256   int               color;
1257   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1258   PetscViewer       viewer;
1259   PetscViewerFormat format;
1260   PetscErrorCode    ierr;
1261 
1262   PetscFunctionBegin;
1263   ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr);
1264   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1265   ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr);
1266 
1267   /* loop over matrix elements drawing boxes */
1268 
1269   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1270     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1271     /* Blue for negative, Cyan for zero and  Red for positive */
1272     color = PETSC_DRAW_BLUE;
1273     for (i=0; i<m; i++) {
1274       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1275       y_l = m - i - 1.0; y_r = y_l + 1.0;
1276       for (j=0; j<a->rlen[i]; j++) {
1277         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1278         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1279         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1280       }
1281     }
1282     color = PETSC_DRAW_CYAN;
1283     for (i=0; i<m; i++) {
1284       shift = a->sliidx[i>>3]+(i&0x07);
1285       y_l = m - i - 1.0; y_r = y_l + 1.0;
1286       for (j=0; j<a->rlen[i]; j++) {
1287         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1288         if (a->val[shift+8*j] !=  0.) continue;
1289         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1290       }
1291     }
1292     color = PETSC_DRAW_RED;
1293     for (i=0; i<m; i++) {
1294       shift = a->sliidx[i>>3]+(i&0x07);
1295       y_l = m - i - 1.0; y_r = y_l + 1.0;
1296       for (j=0; j<a->rlen[i]; j++) {
1297         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1298         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1299         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1300       }
1301     }
1302     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1303   } else {
1304     /* use contour shading to indicate magnitude of values */
1305     /* first determine max of all nonzero values */
1306     PetscReal minv=0.0,maxv=0.0;
1307     PetscInt  count=0;
1308     PetscDraw popup;
1309     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1310       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1311     }
1312     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1313     ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr);
1314     ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr);
1315 
1316     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1317     for (i=0; i<m; i++) {
1318       shift = a->sliidx[i>>3]+(i&0x07);
1319       y_l = m - i - 1.0;
1320       y_r = y_l + 1.0;
1321       for (j=0; j<a->rlen[i]; j++) {
1322         x_l = a->colidx[shift+j*8];
1323         x_r = x_l + 1.0;
1324         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1325         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1326         count++;
1327       }
1328     }
1329     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1330   }
1331   PetscFunctionReturn(0);
1332 }
1333 
1334 #include <petscdraw.h>
1335 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1336 {
1337   PetscDraw      draw;
1338   PetscReal      xr,yr,xl,yl,h,w;
1339   PetscBool      isnull;
1340   PetscErrorCode ierr;
1341 
1342   PetscFunctionBegin;
1343   ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
1344   ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr);
1345   if (isnull) PetscFunctionReturn(0);
1346 
1347   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1348   xr  += w;          yr += h;         xl = -w;     yl = -h;
1349   ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr);
1350   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr);
1351   ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr);
1352   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr);
1353   ierr = PetscDrawSave(draw);CHKERRQ(ierr);
1354   PetscFunctionReturn(0);
1355 }
1356 
1357 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1358 {
1359   PetscBool      iascii,isbinary,isdraw;
1360   PetscErrorCode ierr;
1361 
1362   PetscFunctionBegin;
1363   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
1364   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr);
1365   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr);
1366   if (iascii) {
1367     ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr);
1368   } else if (isbinary) {
1369     /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */
1370   } else if (isdraw) {
1371     ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr);
1372   }
1373   PetscFunctionReturn(0);
1374 }
1375 
1376 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1377 {
1378   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1379   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1380   MatScalar      *vp;
1381   PetscErrorCode ierr;
1382 
1383   PetscFunctionBegin;
1384   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1385   /* To do: compress out the unused elements */
1386   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
1387   ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr);
1388   ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr);
1389   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr);
1390   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1391   for (i=0; i<a->totalslices; ++i) {
1392     shift = a->sliidx[i];    /* starting index of the slice */
1393     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1394     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1395     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1396       row  = 8*i + row_in_slice;
1397       nrow = a->rlen[row]; /* number of nonzeros in row */
1398       /*
1399         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1400         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1401       */
1402       lastcol = 0;
1403       if (nrow>0) { /* nonempty row */
1404         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1405       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1406         for (j=1;j<8;j++) {
1407           if (a->rlen[8*i+j]) {
1408             lastcol = cp[j];
1409             break;
1410           }
1411         }
1412       } else {
1413         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1414       }
1415 
1416       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1417         cp[8*k+row_in_slice] = lastcol;
1418         vp[8*k+row_in_slice] = (MatScalar)0;
1419       }
1420     }
1421   }
1422 
1423   A->info.mallocs += a->reallocs;
1424   a->reallocs      = 0;
1425 
1426   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1427   PetscFunctionReturn(0);
1428 }
1429 
1430 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1431 {
1432   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1433 
1434   PetscFunctionBegin;
1435   info->block_size   = 1.0;
1436   info->nz_allocated = (double)a->maxallocmat;
1437   info->nz_used      = (double)a->sliidx[a->totalslices]; /* include padding zeros */
1438   info->nz_unneeded  = (double)(a->maxallocmat-a->sliidx[a->totalslices]);
1439   info->assemblies   = (double)A->num_ass;
1440   info->mallocs      = (double)A->info.mallocs;
1441   info->memory       = ((PetscObject)A)->mem;
1442   if (A->factortype) {
1443     info->fill_ratio_given  = A->info.fill_ratio_given;
1444     info->fill_ratio_needed = A->info.fill_ratio_needed;
1445     info->factor_mallocs    = A->info.factor_mallocs;
1446   } else {
1447     info->fill_ratio_given  = 0;
1448     info->fill_ratio_needed = 0;
1449     info->factor_mallocs    = 0;
1450   }
1451   PetscFunctionReturn(0);
1452 }
1453 
1454 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1455 {
1456   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1457   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1458   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1459   MatScalar      *vp,value;
1460   PetscErrorCode ierr;
1461 
1462   PetscFunctionBegin;
1463   for (k=0; k<m; k++) { /* loop over added rows */
1464     row = im[k];
1465     if (row < 0) continue;
1466 #if defined(PETSC_USE_DEBUG)
1467     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1468 #endif
1469     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1470     cp    = a->colidx+shift; /* pointer to the row */
1471     vp    = a->val+shift; /* pointer to the row */
1472     nrow  = a->rlen[row];
1473     low   = 0;
1474     high  = nrow;
1475 
1476     for (l=0; l<n; l++) { /* loop over added columns */
1477       col = in[l];
1478       if (col<0) continue;
1479 #if defined(PETSC_USE_DEBUG)
1480       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1);
1481 #endif
1482       if (a->roworiented) {
1483         value = v[l+k*n];
1484       } else {
1485         value = v[k+l*m];
1486       }
1487       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1488 
1489       /* search in this row for the specified colmun, i indicates the column to be set */
1490       if (col <= lastcol) low = 0;
1491       else high = nrow;
1492       lastcol = col;
1493       while (high-low > 5) {
1494         t = (low+high)/2;
1495         if (*(cp+t*8) > col) high = t;
1496         else low = t;
1497       }
1498       for (i=low; i<high; i++) {
1499         if (*(cp+i*8) > col) break;
1500         if (*(cp+i*8) == col) {
1501           if (is == ADD_VALUES) *(vp+i*8) += value;
1502           else *(vp+i*8) = value;
1503           low = i + 1;
1504           goto noinsert;
1505         }
1506       }
1507       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1508       if (nonew == 1) goto noinsert;
1509       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col);
1510       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1511       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1512       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1513       for (ii=nrow-1; ii>=i; ii--) {
1514         *(cp+(ii+1)*8) = *(cp+ii*8);
1515         *(vp+(ii+1)*8) = *(vp+ii*8);
1516       }
1517       a->rlen[row]++;
1518       *(cp+i*8) = col;
1519       *(vp+i*8) = value;
1520       a->nz++;
1521       A->nonzerostate++;
1522       low = i+1; high++; nrow++;
1523 noinsert:;
1524     }
1525     a->rlen[row] = nrow;
1526   }
1527   PetscFunctionReturn(0);
1528 }
1529 
1530 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1531 {
1532   PetscErrorCode ierr;
1533 
1534   PetscFunctionBegin;
1535   /* If the two matrices have the same copy implementation, use fast copy. */
1536   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1537     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1538     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1539 
1540     if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1541     ierr = PetscMemcpy(b->val,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1542   } else {
1543     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1544   }
1545   PetscFunctionReturn(0);
1546 }
1547 
1548 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1549 {
1550   PetscErrorCode ierr;
1551 
1552   PetscFunctionBegin;
1553   ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr);
1554   PetscFunctionReturn(0);
1555 }
1556 
1557 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1558 {
1559   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1560 
1561   PetscFunctionBegin;
1562   *array = a->val;
1563   PetscFunctionReturn(0);
1564 }
1565 
1566 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1567 {
1568   PetscFunctionBegin;
1569   PetscFunctionReturn(0);
1570 }
1571 
1572 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1573 {
1574   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1575   PetscInt    i;
1576   MatScalar   *aval=a->val;
1577 
1578   PetscFunctionBegin;
1579   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1580   PetscFunctionReturn(0);
1581 }
1582 
1583 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1584 {
1585   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1586   PetscInt       i;
1587   MatScalar      *aval=a->val;
1588   PetscErrorCode ierr;
1589 
1590   PetscFunctionBegin;
1591   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1592   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1593   PetscFunctionReturn(0);
1594 }
1595 
1596 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1597 {
1598   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1599   MatScalar      *aval=a->val;
1600   PetscScalar    oalpha=alpha;
1601   PetscBLASInt   one=1,size;
1602   PetscErrorCode ierr;
1603 
1604   PetscFunctionBegin;
1605   ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr);
1606   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1607   ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1608   ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr);
1609   PetscFunctionReturn(0);
1610 }
1611 
1612 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1613 {
1614   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1615   PetscErrorCode ierr;
1616 
1617   PetscFunctionBegin;
1618   if (!Y->preallocated || !y->nz) {
1619     ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr);
1620   }
1621   ierr = MatShift_Basic(Y,a);CHKERRQ(ierr);
1622   PetscFunctionReturn(0);
1623 }
1624 
1625 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1626 {
1627   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1628   PetscScalar       *x,sum,*t;
1629   const MatScalar   *idiag=0,*mdiag;
1630   const PetscScalar *b,*xb;
1631   PetscInt          n,m=A->rmap->n,i,j,shift;
1632   const PetscInt    *diag;
1633   PetscErrorCode    ierr;
1634 
1635   PetscFunctionBegin;
1636   its = its*lits;
1637 
1638   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1639   if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);}
1640   a->fshift = fshift;
1641   a->omega  = omega;
1642 
1643   diag  = a->diag;
1644   t     = a->ssor_work;
1645   idiag = a->idiag;
1646   mdiag = a->mdiag;
1647 
1648   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1649   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1650   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1651   if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1652   if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1653   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1654 
1655   if (flag & SOR_ZERO_INITIAL_GUESS) {
1656     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1657       for (i=0; i<m; i++) {
1658         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1659         sum   = b[i];
1660         n     = (diag[i]-shift)/8;
1661         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1662         t[i]  = sum;
1663         x[i]  = sum*idiag[i];
1664       }
1665       xb   = t;
1666       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1667     } else xb = b;
1668     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1669       for (i=m-1; i>=0; i--) {
1670         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1671         sum   = xb[i];
1672         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1673         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1674         if (xb == b) {
1675           x[i] = sum*idiag[i];
1676         } else {
1677           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1678         }
1679       }
1680       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1681     }
1682     its--;
1683   }
1684   while (its--) {
1685     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1686       for (i=0; i<m; i++) {
1687         /* lower */
1688         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1689         sum   = b[i];
1690         n     = (diag[i]-shift)/8;
1691         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1692         t[i]  = sum;             /* save application of the lower-triangular part */
1693         /* upper */
1694         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1695         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1696         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1697       }
1698       xb   = t;
1699       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1700     } else xb = b;
1701     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1702       for (i=m-1; i>=0; i--) {
1703         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1704         sum = xb[i];
1705         if (xb == b) {
1706           /* whole matrix (no checkpointing available) */
1707           n     = a->rlen[i];
1708           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1709           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1710         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1711           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1712           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1713           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1714         }
1715       }
1716       if (xb == b) {
1717         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1718       } else {
1719         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1720       }
1721     }
1722   }
1723   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1724   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1725   PetscFunctionReturn(0);
1726 }
1727 
1728 /* -------------------------------------------------------------------*/
1729 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1730                                        MatGetRow_SeqSELL,
1731                                        MatRestoreRow_SeqSELL,
1732                                        MatMult_SeqSELL,
1733                                /* 4*/  MatMultAdd_SeqSELL,
1734                                        MatMultTranspose_SeqSELL,
1735                                        MatMultTransposeAdd_SeqSELL,
1736                                        0,
1737                                        0,
1738                                        0,
1739                                /* 10*/ 0,
1740                                        0,
1741                                        0,
1742                                        MatSOR_SeqSELL,
1743                                        0,
1744                                /* 15*/ MatGetInfo_SeqSELL,
1745                                        MatEqual_SeqSELL,
1746                                        MatGetDiagonal_SeqSELL,
1747                                        MatDiagonalScale_SeqSELL,
1748                                        0,
1749                                /* 20*/ 0,
1750                                        MatAssemblyEnd_SeqSELL,
1751                                        MatSetOption_SeqSELL,
1752                                        MatZeroEntries_SeqSELL,
1753                                /* 24*/ 0,
1754                                        0,
1755                                        0,
1756                                        0,
1757                                        0,
1758                                /* 29*/ MatSetUp_SeqSELL,
1759                                        0,
1760                                        0,
1761                                        0,
1762                                        0,
1763                                /* 34*/ MatDuplicate_SeqSELL,
1764                                        0,
1765                                        0,
1766                                        0,
1767                                        0,
1768                                /* 39*/ 0,
1769                                        0,
1770                                        0,
1771                                        MatGetValues_SeqSELL,
1772                                        MatCopy_SeqSELL,
1773                                /* 44*/ 0,
1774                                        MatScale_SeqSELL,
1775                                        MatShift_SeqSELL,
1776                                        0,
1777                                        0,
1778                                /* 49*/ 0,
1779                                        0,
1780                                        0,
1781                                        0,
1782                                        0,
1783                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1784                                        0,
1785                                        0,
1786                                        0,
1787                                        0,
1788                                /* 59*/ 0,
1789                                        MatDestroy_SeqSELL,
1790                                        MatView_SeqSELL,
1791                                        0,
1792                                        0,
1793                                /* 64*/ 0,
1794                                        0,
1795                                        0,
1796                                        0,
1797                                        0,
1798                                /* 69*/ 0,
1799                                        0,
1800                                        0,
1801                                        0,
1802                                        0,
1803                                /* 74*/ 0,
1804                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1805                                        0,
1806                                        0,
1807                                        0,
1808                                /* 79*/ 0,
1809                                        0,
1810                                        0,
1811                                        0,
1812                                        0,
1813                                /* 84*/ 0,
1814                                        0,
1815                                        0,
1816                                        0,
1817                                        0,
1818                                /* 89*/ 0,
1819                                        0,
1820                                        0,
1821                                        0,
1822                                        0,
1823                                /* 94*/ 0,
1824                                        0,
1825                                        0,
1826                                        0,
1827                                        0,
1828                                /* 99*/ 0,
1829                                        0,
1830                                        0,
1831                                        MatConjugate_SeqSELL,
1832                                        0,
1833                                /*104*/ 0,
1834                                        0,
1835                                        0,
1836                                        0,
1837                                        0,
1838                                /*109*/ 0,
1839                                        0,
1840                                        0,
1841                                        0,
1842                                        MatMissingDiagonal_SeqSELL,
1843                                /*114*/ 0,
1844                                        0,
1845                                        0,
1846                                        0,
1847                                        0,
1848                                /*119*/ 0,
1849                                        0,
1850                                        0,
1851                                        0,
1852                                        0,
1853                                /*124*/ 0,
1854                                        0,
1855                                        0,
1856                                        0,
1857                                        0,
1858                                /*129*/ 0,
1859                                        0,
1860                                        0,
1861                                        0,
1862                                        0,
1863                                /*134*/ 0,
1864                                        0,
1865                                        0,
1866                                        0,
1867                                        0,
1868                                /*139*/ 0,
1869                                        0,
1870                                        0,
1871                                        MatFDColoringSetUp_SeqXAIJ,
1872                                        0,
1873                                 /*144*/0
1874 };
1875 
1876 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1877 {
1878   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1879   PetscErrorCode ierr;
1880 
1881   PetscFunctionBegin;
1882   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1883 
1884   /* allocate space for values if not already there */
1885   if (!a->saved_values) {
1886     ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr);
1887     ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1888   }
1889 
1890   /* copy values over */
1891   ierr = PetscMemcpy(a->saved_values,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1892   PetscFunctionReturn(0);
1893 }
1894 
1895 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1896 {
1897   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1898   PetscErrorCode ierr;
1899 
1900   PetscFunctionBegin;
1901   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1902   if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1903   /* copy values over */
1904   ierr = PetscMemcpy(a->val,a->saved_values,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1905   PetscFunctionReturn(0);
1906 }
1907 
1908 /*@C
1909  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1910 
1911  Not Collective
1912 
1913  Input Parameters:
1914  .  mat - a MATSEQSELL matrix
1915  .  array - pointer to the data
1916 
1917  Level: intermediate
1918 
1919  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1920  @*/
1921 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1922 {
1923   PetscErrorCode ierr;
1924 
1925   PetscFunctionBegin;
1926   ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr);
1927   PetscFunctionReturn(0);
1928 }
1929 
1930 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1931 {
1932   Mat_SeqSELL    *b;
1933   PetscMPIInt    size;
1934   PetscErrorCode ierr;
1935 
1936   PetscFunctionBegin;
1937   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
1938   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1939 
1940   ierr = PetscNewLog(B,&b);CHKERRQ(ierr);
1941 
1942   B->data = (void*)b;
1943 
1944   ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1945 
1946   b->row                = 0;
1947   b->col                = 0;
1948   b->icol               = 0;
1949   b->reallocs           = 0;
1950   b->ignorezeroentries  = PETSC_FALSE;
1951   b->roworiented        = PETSC_TRUE;
1952   b->nonew              = 0;
1953   b->diag               = 0;
1954   b->solve_work         = 0;
1955   B->spptr              = 0;
1956   b->saved_values       = 0;
1957   b->idiag              = 0;
1958   b->mdiag              = 0;
1959   b->ssor_work          = 0;
1960   b->omega              = 1.0;
1961   b->fshift             = 0.0;
1962   b->idiagvalid         = PETSC_FALSE;
1963   b->ibdiagvalid        = PETSC_FALSE;
1964   b->keepnonzeropattern = PETSC_FALSE;
1965 
1966   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr);
1967   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr);
1968   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr);
1969   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr);
1970   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr);
1971   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr);
1972   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr);
1973   PetscFunctionReturn(0);
1974 }
1975 
1976 /*
1977  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1978  */
1979 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1980 {
1981   Mat_SeqSELL    *c,*a=(Mat_SeqSELL*)A->data;
1982   PetscInt       i,m=A->rmap->n;
1983   PetscInt       totalslices=a->totalslices;
1984   PetscErrorCode ierr;
1985 
1986   PetscFunctionBegin;
1987   c = (Mat_SeqSELL*)C->data;
1988 
1989   C->factortype = A->factortype;
1990   c->row        = 0;
1991   c->col        = 0;
1992   c->icol       = 0;
1993   c->reallocs   = 0;
1994 
1995   C->assembled = PETSC_TRUE;
1996 
1997   ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr);
1998   ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr);
1999 
2000   ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr);
2001   ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2002   ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr);
2003   ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
2004 
2005   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
2006   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
2007 
2008   /* allocate the matrix space */
2009   if (mallocmatspace) {
2010     ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr);
2011     ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
2012 
2013     c->singlemalloc = PETSC_TRUE;
2014 
2015     if (m > 0) {
2016       ierr = PetscMemcpy(c->colidx,a->colidx,(a->maxallocmat)*sizeof(PetscInt));CHKERRQ(ierr);
2017       if (cpvalues == MAT_COPY_VALUES) {
2018         ierr = PetscMemcpy(c->val,a->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2019       } else {
2020         ierr = PetscMemzero(c->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2021       }
2022     }
2023   }
2024 
2025   c->ignorezeroentries = a->ignorezeroentries;
2026   c->roworiented       = a->roworiented;
2027   c->nonew             = a->nonew;
2028   if (a->diag) {
2029     ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr);
2030     ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2031     for (i=0; i<m; i++) {
2032       c->diag[i] = a->diag[i];
2033     }
2034   } else c->diag = 0;
2035 
2036   c->solve_work         = 0;
2037   c->saved_values       = 0;
2038   c->idiag              = 0;
2039   c->ssor_work          = 0;
2040   c->keepnonzeropattern = a->keepnonzeropattern;
2041   c->free_val           = PETSC_TRUE;
2042   c->free_colidx        = PETSC_TRUE;
2043 
2044   c->maxallocmat  = a->maxallocmat;
2045   c->maxallocrow  = a->maxallocrow;
2046   c->rlenmax      = a->rlenmax;
2047   c->nz           = a->nz;
2048   C->preallocated = PETSC_TRUE;
2049 
2050   c->nonzerorowcnt = a->nonzerorowcnt;
2051   C->nonzerostate  = A->nonzerostate;
2052 
2053   ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr);
2054   PetscFunctionReturn(0);
2055 }
2056 
2057 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2058 {
2059   PetscErrorCode ierr;
2060 
2061   PetscFunctionBegin;
2062   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
2063   ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
2064   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2065     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
2066   }
2067   ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
2068   ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr);
2069   PetscFunctionReturn(0);
2070 }
2071 
2072 /*@C
2073  MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2074 
2075  Collective on MPI_Comm
2076 
2077  Input Parameters:
2078  +  comm - MPI communicator, set to PETSC_COMM_SELF
2079  .  m - number of rows
2080  .  n - number of columns
2081  .  rlenmax - maximum number of nonzeros in a row
2082  -  rlen - array containing the number of nonzeros in the various rows
2083  (possibly different for each row) or NULL
2084 
2085  Output Parameter:
2086  .  A - the matrix
2087 
2088  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2089  MatXXXXSetPreallocation() paradgm instead of this routine directly.
2090  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2091 
2092  Notes:
2093  If nnz is given then nz is ignored
2094 
2095  Specify the preallocated storage with either rlenmax or rlen (not both).
2096  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2097  allocation.  For large problems you MUST preallocate memory or you
2098  will get TERRIBLE performance, see the users' manual chapter on matrices.
2099 
2100  Level: intermediate
2101 
2102  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatCreateSeqSELLWithArrays()
2103 
2104  @*/
2105 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2106 {
2107   PetscErrorCode ierr;
2108 
2109   PetscFunctionBegin;
2110   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2111   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2112   ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr);
2113   ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr);
2114   PetscFunctionReturn(0);
2115 }
2116 
2117 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2118 {
2119   Mat_SeqSELL     *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2120   PetscInt       totalslices=a->totalslices;
2121   PetscErrorCode ierr;
2122 
2123   PetscFunctionBegin;
2124   /* If the  matrix dimensions are not equal,or no of nonzeros */
2125   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2126     *flg = PETSC_FALSE;
2127     PetscFunctionReturn(0);
2128   }
2129   /* if the a->colidx are the same */
2130   ierr = PetscMemcmp(a->colidx,b->colidx,a->sliidx[totalslices]*sizeof(PetscInt),flg);CHKERRQ(ierr);
2131   if (!*flg) PetscFunctionReturn(0);
2132   /* if a->val are the same */
2133   ierr = PetscMemcmp(a->val,b->val,a->sliidx[totalslices]*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2134   PetscFunctionReturn(0);
2135 }
2136 
2137 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2138 {
2139   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2140 
2141   PetscFunctionBegin;
2142   a->idiagvalid  = PETSC_FALSE;
2143   a->ibdiagvalid = PETSC_FALSE;
2144   PetscFunctionReturn(0);
2145 }
2146 
2147 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2148 {
2149 #if defined(PETSC_USE_COMPLEX)
2150   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2151   PetscInt    i;
2152   PetscScalar *val = a->val;
2153 
2154   PetscFunctionBegin;
2155   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2156     val[i] = PetscConj(val[i]);
2157   }
2158 #else
2159   PetscFunctionBegin;
2160 #endif
2161   PetscFunctionReturn(0);
2162 }
2163