xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 95c0884e6f7665b705eebf88174e89dc920c2fc0)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 
10   #include <immintrin.h>
11 
12   #if !defined(_MM_SCALE_8)
13   #define _MM_SCALE_8    8
14   #endif
15 
16   #if defined(__AVX512F__)
17   /* these do not work
18    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
19    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
20   */
21     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
22     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
23     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
24     vec_vals = _mm512_loadu_pd(aval); \
25     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
26     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
27   #elif defined(__AVX2__) && defined(__FMA__)
28     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
29     vec_vals = _mm256_loadu_pd(aval); \
30     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
31     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
32     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
33   #endif
34 #endif  /* PETSC_HAVE_IMMINTRIN_H */
35 
36 /*@C
37  MatSeqSELLSetPreallocation - For good matrix assembly performance
38  the user should preallocate the matrix storage by setting the parameter nz
39  (or the array nnz).  By setting these parameters accurately, performance
40  during matrix assembly can be increased significantly.
41 
42  Collective on MPI_Comm
43 
44  Input Parameters:
45  +  B - The matrix
46  .  nz - number of nonzeros per row (same for all rows)
47  -  nnz - array containing the number of nonzeros in the various rows
48  (possibly different for each row) or NULL
49 
50  Notes:
51  If nnz is given then nz is ignored.
52 
53  Specify the preallocated storage with either nz or nnz (not both).
54  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
55  allocation.  For large problems you MUST preallocate memory or you
56  will get TERRIBLE performance, see the users' manual chapter on matrices.
57 
58  You can call MatGetInfo() to get information on how effective the preallocation was;
59  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
60  You can also run with the option -info and look for messages with the string
61  malloc in them to see if additional memory allocation was needed.
62 
63  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
64  entries or columns indices.
65 
66  The maximum number of nonzeos in any row should be as accuate as possible.
67  If it is underesitmated, you will get bad performance due to reallocation
68  (MatSeqXSELLReallocateSELL).
69 
70  Level: intermediate
71 
72  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
73 
74  @*/
75 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
76 {
77   PetscErrorCode ierr;
78 
79   PetscFunctionBegin;
80   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
81   PetscValidType(B,1);
82   ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr);
83   PetscFunctionReturn(0);
84 }
85 
86 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
87 {
88   Mat_SeqSELL    *b;
89   PetscInt       i,j,totalslices;
90   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
91   PetscErrorCode ierr;
92 
93   PetscFunctionBegin;
94   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
95   if (maxallocrow == MAT_SKIP_ALLOCATION) {
96     skipallocation = PETSC_TRUE;
97     maxallocrow    = 0;
98   }
99 
100   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
101   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
102 
103   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
104   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
105   if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow);
106   if (rlen) {
107     for (i=0; i<B->rmap->n; i++) {
108       if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]);
109       if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n);
110     }
111   }
112 
113   B->preallocated = PETSC_TRUE;
114 
115   b = (Mat_SeqSELL*)B->data;
116 
117   totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */
118   b->totalslices = totalslices;
119   if (!skipallocation) {
120     if (B->rmap->n & 0x07) {ierr = PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);CHKERRQ(ierr);}
121 
122     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
123       ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr);
124       ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
125     }
126     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
127       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
128       else if (maxallocrow < 0) maxallocrow = 1;
129       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
130     } else {
131       maxallocrow = 0;
132       b->sliidx[0] = 0;
133       for (i=1; i<totalslices; i++) {
134         b->sliidx[i] = 0;
135         for (j=0;j<8;j++) {
136           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
137         }
138         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
139         b->sliidx[i] = b->sliidx[i-1] + 8*b->sliidx[i];
140       }
141       /* last slice */
142       b->sliidx[totalslices] = 0;
143       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
144       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
145       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
146     }
147 
148     /* allocate space for val, colidx, rlen */
149     /* FIXME: should B's old memory be unlogged? */
150     ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr);
151     /* FIXME: assuming an element of the bit array takes 8 bits */
152     ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr);
153     ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
154     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
155     ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr);
156     ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr);
157 
158     b->singlemalloc = PETSC_TRUE;
159     b->free_val     = PETSC_TRUE;
160     b->free_colidx  = PETSC_TRUE;
161   } else {
162     b->free_val    = PETSC_FALSE;
163     b->free_colidx = PETSC_FALSE;
164   }
165 
166   b->nz               = 0;
167   b->maxallocrow      = maxallocrow;
168   b->rlenmax          = maxallocrow;
169   b->maxallocmat      = b->sliidx[totalslices];
170   B->info.nz_unneeded = (double)b->maxallocmat;
171   if (realalloc) {
172     ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
173   }
174   PetscFunctionReturn(0);
175 }
176 
177 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
178 {
179   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
180   PetscInt    shift;
181 
182   PetscFunctionBegin;
183   if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row);
184   if (nz) *nz = a->rlen[row];
185   shift = a->sliidx[row>>3]+(row&0x07);
186   if (!a->getrowcols) {
187     PetscErrorCode ierr;
188 
189     ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr);
190   }
191   if (idx) {
192     PetscInt j;
193     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
194     *idx = a->getrowcols;
195   }
196   if (v) {
197     PetscInt j;
198     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
199     *v = a->getrowvals;
200   }
201   PetscFunctionReturn(0);
202 }
203 
204 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
205 {
206   PetscFunctionBegin;
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
211 {
212   Mat            B;
213   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
214   PetscInt       i;
215   PetscErrorCode ierr;
216 
217   PetscFunctionBegin;
218   if (reuse == MAT_REUSE_MATRIX) {
219     B    = *newmat;
220     ierr = MatZeroEntries(B);CHKERRQ(ierr);
221   } else {
222     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
223     ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr);
224     ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr);
225     ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<A->rmap->n; i++) {
229     PetscInt    nz = 0,*cols = NULL;
230     PetscScalar *vals = NULL;
231 
232     ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
233     ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
234     ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
235   }
236 
237   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
238   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
239   B->rmap->bs = A->rmap->bs;
240 
241   if (reuse == MAT_INPLACE_MATRIX) {
242     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
243   } else {
244     *newmat = B;
245   }
246   PetscFunctionReturn(0);
247 }
248 
249 #include <../src/mat/impls/aij/seq/aij.h>
250 
251 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
252 {
253   Mat               B;
254   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
255   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
256   const PetscInt    *cols;
257   const PetscScalar *vals;
258   PetscErrorCode    ierr;
259 
260   PetscFunctionBegin;
261   if (A->rmap->bs > 1) {
262     ierr = MatConvert_Basic(A,newtype,reuse,newmat);CHKERRQ(ierr);
263     PetscFunctionReturn(0);
264   }
265 
266   if (reuse == MAT_REUSE_MATRIX) {
267     B = *newmat;
268   } else {
269     /* Can we just use ilen? */
270     ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr);
271     for (i=0; i<m; i++) {
272       rowlengths[i] = ai[i+1] - ai[i];
273     }
274 
275     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
276     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
277     ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr);
278     ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr);
279     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
280   }
281 
282   for (row=0; row<m; row++) {
283     ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
284     ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
285     ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
286   }
287   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
288   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
289   B->rmap->bs = A->rmap->bs;
290 
291   if (reuse == MAT_INPLACE_MATRIX) {
292     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
293   } else {
294     *newmat = B;
295   }
296   PetscFunctionReturn(0);
297 }
298 
299 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
300 {
301   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
302   PetscScalar       *y;
303   const PetscScalar *x;
304   const MatScalar   *aval=a->val;
305   PetscInt          totalslices=a->totalslices;
306   const PetscInt    *acolidx=a->colidx;
307   PetscInt          i,j;
308   PetscErrorCode    ierr;
309 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
310   __m512d           vec_x,vec_y,vec_vals;
311   __m256i           vec_idx;
312   __mmask8          mask;
313   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
314   __m256i           vec_idx2,vec_idx3,vec_idx4;
315 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
316   __m128i           vec_idx;
317   __m256d           vec_x,vec_y,vec_y2,vec_vals;
318   MatScalar         yval;
319   PetscInt          r,rows_left,row,nnz_in_row;
320 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
321   __m128d           vec_x_tmp;
322   __m256d           vec_x,vec_y,vec_y2,vec_vals;
323   MatScalar         yval;
324   PetscInt          r,rows_left,row,nnz_in_row;
325 #else
326   PetscScalar       sum[8];
327 #endif
328 
329 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
330 #pragma disjoint(*x,*y,*aval)
331 #endif
332 
333   PetscFunctionBegin;
334   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
335   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
336 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
337   for (i=0; i<totalslices; i++) { /* loop over slices */
338     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
339     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
340 
341     vec_y  = _mm512_setzero_pd();
342     vec_y2 = _mm512_setzero_pd();
343     vec_y3 = _mm512_setzero_pd();
344     vec_y4 = _mm512_setzero_pd();
345 
346     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
347     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
348     case 3:
349       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
350       acolidx += 8; aval += 8;
351       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
352       acolidx += 8; aval += 8;
353       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
354       acolidx += 8; aval += 8;
355       j += 3;
356       break;
357     case 2:
358       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
359       acolidx += 8; aval += 8;
360       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
361       acolidx += 8; aval += 8;
362       j += 2;
363       break;
364     case 1:
365       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
366       acolidx += 8; aval += 8;
367       j += 1;
368       break;
369     }
370     #pragma novector
371     for (; j<(a->sliidx[i+1]>>3); j+=4) {
372       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
373       acolidx += 8; aval += 8;
374       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
375       acolidx += 8; aval += 8;
376       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
377       acolidx += 8; aval += 8;
378       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
379       acolidx += 8; aval += 8;
380     }
381 
382     vec_y = _mm512_add_pd(vec_y,vec_y2);
383     vec_y = _mm512_add_pd(vec_y,vec_y3);
384     vec_y = _mm512_add_pd(vec_y,vec_y4);
385     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
386       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
387       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
388     } else {
389       _mm512_storeu_pd(&y[8*i],vec_y);
390     }
391   }
392 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
393   for (i=0; i<totalslices; i++) { /* loop over full slices */
394     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
395     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
396 
397     /* last slice may have padding rows. Don't use vectorization. */
398     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
399       rows_left = A->rmap->n - 8*i;
400       for (r=0; r<rows_left; ++r) {
401         yval = (MatScalar)0;
402         row = 8*i + r;
403         nnz_in_row = a->rlen[row];
404         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
405         y[row] = yval;
406       }
407       break;
408     }
409 
410     vec_y  = _mm256_setzero_pd();
411     vec_y2 = _mm256_setzero_pd();
412 
413     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
414     #pragma novector
415     #pragma unroll(2)
416     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
417       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
418       aval += 4; acolidx += 4;
419       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
420       aval += 4; acolidx += 4;
421     }
422 
423     _mm256_storeu_pd(y+i*8,vec_y);
424     _mm256_storeu_pd(y+i*8+4,vec_y2);
425   }
426 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
427   for (i=0; i<totalslices; i++) { /* loop over full slices */
428     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
429     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
430 
431     vec_y  = _mm256_setzero_pd();
432     vec_y2 = _mm256_setzero_pd();
433 
434     /* last slice may have padding rows. Don't use vectorization. */
435     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
436       rows_left = A->rmap->n - 8*i;
437       for (r=0; r<rows_left; ++r) {
438         yval = (MatScalar)0;
439         row = 8*i + r;
440         nnz_in_row = a->rlen[row];
441         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
442         y[row] = yval;
443       }
444       break;
445     }
446 
447     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
448     #pragma novector
449     #pragma unroll(2)
450     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
451       vec_vals  = _mm256_loadu_pd(aval);
452       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
453       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
454       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
455       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
456       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
457       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
458       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
459       aval     += 4;
460 
461       vec_vals  = _mm256_loadu_pd(aval);
462       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
463       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
464       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
465       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
466       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
467       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
468       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
469       aval     += 4;
470     }
471 
472     _mm256_storeu_pd(y + i*8,     vec_y);
473     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
474   }
475 #else
476   for (i=0; i<totalslices; i++) { /* loop over slices */
477     for (j=0; j<8; j++) sum[j] = 0.0;
478     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
479       sum[0] += aval[j] * x[acolidx[j]];
480       sum[1] += aval[j+1] * x[acolidx[j+1]];
481       sum[2] += aval[j+2] * x[acolidx[j+2]];
482       sum[3] += aval[j+3] * x[acolidx[j+3]];
483       sum[4] += aval[j+4] * x[acolidx[j+4]];
484       sum[5] += aval[j+5] * x[acolidx[j+5]];
485       sum[6] += aval[j+6] * x[acolidx[j+6]];
486       sum[7] += aval[j+7] * x[acolidx[j+7]];
487     }
488     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
489       for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
490     } else {
491       for(j=0; j<8; j++) y[8*i+j] = sum[j];
492     }
493   }
494 #endif
495 
496   ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */
497   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
498   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
499   PetscFunctionReturn(0);
500 }
501 
502 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
503 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
504 {
505   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
506   PetscScalar       *y,*z;
507   const PetscScalar *x;
508   const MatScalar   *aval=a->val;
509   PetscInt          totalslices=a->totalslices;
510   const PetscInt    *acolidx=a->colidx;
511   PetscInt          i,j;
512   PetscErrorCode    ierr;
513 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
514   __m512d           vec_x,vec_y,vec_vals;
515   __m256i           vec_idx;
516   __mmask8          mask;
517   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
518   __m256i           vec_idx2,vec_idx3,vec_idx4;
519 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
520   __m128d           vec_x_tmp;
521   __m256d           vec_x,vec_y,vec_y2,vec_vals;
522   MatScalar         yval;
523   PetscInt          r,row,nnz_in_row;
524 #else
525   PetscScalar       sum[8];
526 #endif
527 
528 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
529 #pragma disjoint(*x,*y,*aval)
530 #endif
531 
532   PetscFunctionBegin;
533   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
534   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
535 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
536   for (i=0; i<totalslices; i++) { /* loop over slices */
537     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
538     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
539 
540     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
541       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
542       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
543     } else {
544       vec_y  = _mm512_loadu_pd(&y[8*i]);
545     }
546     vec_y2 = _mm512_setzero_pd();
547     vec_y3 = _mm512_setzero_pd();
548     vec_y4 = _mm512_setzero_pd();
549 
550     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
551     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
552     case 3:
553       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
554       acolidx += 8; aval += 8;
555       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
556       acolidx += 8; aval += 8;
557       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
558       acolidx += 8; aval += 8;
559       j += 3;
560       break;
561     case 2:
562       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
563       acolidx += 8; aval += 8;
564       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
565       acolidx += 8; aval += 8;
566       j += 2;
567       break;
568     case 1:
569       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
570       acolidx += 8; aval += 8;
571       j += 1;
572       break;
573     }
574     #pragma novector
575     for (; j<(a->sliidx[i+1]>>3); j+=4) {
576       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
577       acolidx += 8; aval += 8;
578       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
579       acolidx += 8; aval += 8;
580       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
581       acolidx += 8; aval += 8;
582       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
583       acolidx += 8; aval += 8;
584     }
585 
586     vec_y = _mm512_add_pd(vec_y,vec_y2);
587     vec_y = _mm512_add_pd(vec_y,vec_y3);
588     vec_y = _mm512_add_pd(vec_y,vec_y4);
589     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
590       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
591     } else {
592       _mm512_storeu_pd(&z[8*i],vec_y);
593     }
594   }
595 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
596   for (i=0; i<totalslices; i++) { /* loop over full slices */
597     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
598     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
599 
600     /* last slice may have padding rows. Don't use vectorization. */
601     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
602       for (r=0; r<(A->rmap->n & 0x07); ++r) {
603         row        = 8*i + r;
604         yval       = (MatScalar)0.0;
605         nnz_in_row = a->rlen[row];
606         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
607         z[row] = y[row] + yval;
608       }
609       break;
610     }
611 
612     vec_y  = _mm256_loadu_pd(y+8*i);
613     vec_y2 = _mm256_loadu_pd(y+8*i+4);
614 
615     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
616     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
617       vec_vals  = _mm256_loadu_pd(aval);
618       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
619       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
620       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
621       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
622       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
623       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
624       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
625       aval     += 4;
626 
627       vec_vals  = _mm256_loadu_pd(aval);
628       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
629       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
630       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
631       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
632       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
633       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
634       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
635       aval     += 4;
636     }
637 
638     _mm256_storeu_pd(z+i*8,vec_y);
639     _mm256_storeu_pd(z+i*8+4,vec_y2);
640   }
641 #else
642   for (i=0; i<totalslices; i++) { /* loop over slices */
643     for (j=0; j<8; j++) sum[j] = 0.0;
644     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
645       sum[0] += aval[j] * x[acolidx[j]];
646       sum[1] += aval[j+1] * x[acolidx[j+1]];
647       sum[2] += aval[j+2] * x[acolidx[j+2]];
648       sum[3] += aval[j+3] * x[acolidx[j+3]];
649       sum[4] += aval[j+4] * x[acolidx[j+4]];
650       sum[5] += aval[j+5] * x[acolidx[j+5]];
651       sum[6] += aval[j+6] * x[acolidx[j+6]];
652       sum[7] += aval[j+7] * x[acolidx[j+7]];
653     }
654     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
655       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
656     } else {
657       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
658     }
659   }
660 #endif
661 
662   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
663   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
664   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
665   PetscFunctionReturn(0);
666 }
667 
668 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
669 {
670   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
671   PetscScalar       *y;
672   const PetscScalar *x;
673   const MatScalar   *aval=a->val;
674   const PetscInt    *acolidx=a->colidx;
675   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
676   PetscErrorCode    ierr;
677 
678 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
679 #pragma disjoint(*x,*y,*aval)
680 #endif
681 
682   PetscFunctionBegin;
683   if (A->symmetric) {
684     ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr);
685     PetscFunctionReturn(0);
686   }
687   if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); }
688   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
689   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
690   for (i=0; i<a->totalslices; i++) { /* loop over slices */
691     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
692       for (r=0; r<(A->rmap->n & 0x07); ++r) {
693         row        = 8*i + r;
694         nnz_in_row = a->rlen[row];
695         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
696       }
697       break;
698     }
699     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
700       y[acolidx[j]]   += aval[j] * x[8*i];
701       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
702       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
703       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
704       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
705       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
706       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
707       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
708     }
709   }
710   ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr);
711   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
712   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
713   PetscFunctionReturn(0);
714 }
715 
716 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
717 {
718   PetscErrorCode ierr;
719 
720   PetscFunctionBegin;
721   if (A->symmetric) {
722     ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr);
723   } else {
724     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
725     ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr);
726   }
727   PetscFunctionReturn(0);
728 }
729 
730 /*
731      Checks for missing diagonals
732 */
733 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
734 {
735   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
736   PetscInt       *diag,i;
737   PetscErrorCode ierr;
738 
739   PetscFunctionBegin;
740   *missing = PETSC_FALSE;
741   if (A->rmap->n > 0 && !(a->colidx)) {
742     *missing = PETSC_TRUE;
743     if (d) *d = 0;
744     ierr = PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");CHKERRQ(ierr);
745   } else {
746     diag = a->diag;
747     for (i=0; i<A->rmap->n; i++) {
748       if (diag[i] == -1) {
749         *missing = PETSC_TRUE;
750         if (d) *d = i;
751         ierr = PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);CHKERRQ(ierr);
752         break;
753       }
754     }
755   }
756   PetscFunctionReturn(0);
757 }
758 
759 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
760 {
761   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
762   PetscInt       i,j,m=A->rmap->n,shift;
763   PetscErrorCode ierr;
764 
765   PetscFunctionBegin;
766   if (!a->diag) {
767     ierr         = PetscMalloc1(m,&a->diag);CHKERRQ(ierr);
768     ierr         = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr);
769     a->free_diag = PETSC_TRUE;
770   }
771   for (i=0; i<m; i++) { /* loop over rows */
772     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
773     a->diag[i] = -1;
774     for (j=0; j<a->rlen[i]; j++) {
775       if (a->colidx[shift+j*8] == i) {
776         a->diag[i] = shift+j*8;
777         break;
778       }
779     }
780   }
781   PetscFunctionReturn(0);
782 }
783 
784 /*
785   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
786 */
787 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
788 {
789   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
790   PetscInt       i,*diag,m = A->rmap->n;
791   MatScalar      *val = a->val;
792   PetscScalar    *idiag,*mdiag;
793   PetscErrorCode ierr;
794 
795   PetscFunctionBegin;
796   if (a->idiagvalid) PetscFunctionReturn(0);
797   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
798   diag = a->diag;
799   if (!a->idiag) {
800     ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr);
801     ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr);
802     val  = a->val;
803   }
804   mdiag = a->mdiag;
805   idiag = a->idiag;
806 
807   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
808     for (i=0; i<m; i++) {
809       mdiag[i] = val[diag[i]];
810       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
811         if (PetscRealPart(fshift)) {
812           ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr);
813           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
814           A->factorerror_zeropivot_value = 0.0;
815           A->factorerror_zeropivot_row   = i;
816         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i);
817       }
818       idiag[i] = 1.0/val[diag[i]];
819     }
820     ierr = PetscLogFlops(m);CHKERRQ(ierr);
821   } else {
822     for (i=0; i<m; i++) {
823       mdiag[i] = val[diag[i]];
824       idiag[i] = omega/(fshift + val[diag[i]]);
825     }
826     ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
827   }
828   a->idiagvalid = PETSC_TRUE;
829   PetscFunctionReturn(0);
830 }
831 
832 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
833 {
834   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
835   PetscErrorCode ierr;
836 
837   PetscFunctionBegin;
838   ierr = PetscMemzero(a->val,(a->sliidx[a->totalslices])*sizeof(PetscScalar));CHKERRQ(ierr);
839   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
840   PetscFunctionReturn(0);
841 }
842 
843 PetscErrorCode MatDestroy_SeqSELL(Mat A)
844 {
845   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
846   PetscErrorCode ierr;
847 
848   PetscFunctionBegin;
849 #if defined(PETSC_USE_LOG)
850   PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz);
851 #endif
852   ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr);
853   ierr = ISDestroy(&a->row);CHKERRQ(ierr);
854   ierr = ISDestroy(&a->col);CHKERRQ(ierr);
855   ierr = PetscFree(a->diag);CHKERRQ(ierr);
856   ierr = PetscFree(a->rlen);CHKERRQ(ierr);
857   ierr = PetscFree(a->sliidx);CHKERRQ(ierr);
858   ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr);
859   ierr = PetscFree(a->solve_work);CHKERRQ(ierr);
860   ierr = ISDestroy(&a->icol);CHKERRQ(ierr);
861   ierr = PetscFree(a->saved_values);CHKERRQ(ierr);
862   ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr);
863 
864   ierr = PetscFree(A->data);CHKERRQ(ierr);
865 
866   ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr);
867   ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr);
868   ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr);
869 #if defined(PETSC_HAVE_ELEMENTAL)
870 #endif
871   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr);
872   PetscFunctionReturn(0);
873 }
874 
875 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
876 {
877   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
878   PetscErrorCode ierr;
879 
880   PetscFunctionBegin;
881   switch (op) {
882   case MAT_ROW_ORIENTED:
883     a->roworiented = flg;
884     break;
885   case MAT_KEEP_NONZERO_PATTERN:
886     a->keepnonzeropattern = flg;
887     break;
888   case MAT_NEW_NONZERO_LOCATIONS:
889     a->nonew = (flg ? 0 : 1);
890     break;
891   case MAT_NEW_NONZERO_LOCATION_ERR:
892     a->nonew = (flg ? -1 : 0);
893     break;
894   case MAT_NEW_NONZERO_ALLOCATION_ERR:
895     a->nonew = (flg ? -2 : 0);
896     break;
897   case MAT_UNUSED_NONZERO_LOCATION_ERR:
898     a->nounused = (flg ? -1 : 0);
899     break;
900   case MAT_NEW_DIAGONALS:
901   case MAT_IGNORE_OFF_PROC_ENTRIES:
902   case MAT_USE_HASH_TABLE:
903     ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr);
904     break;
905   case MAT_SPD:
906   case MAT_SYMMETRIC:
907   case MAT_STRUCTURALLY_SYMMETRIC:
908   case MAT_HERMITIAN:
909   case MAT_SYMMETRY_ETERNAL:
910     /* These options are handled directly by MatSetOption() */
911     break;
912   default:
913     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
914   }
915   PetscFunctionReturn(0);
916 }
917 
918 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
919 {
920   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
921   PetscInt       i,j,n,shift;
922   PetscScalar    *x,zero=0.0;
923   PetscErrorCode ierr;
924 
925   PetscFunctionBegin;
926   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
927   if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
928 
929   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
930     PetscInt *diag=a->diag;
931     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
932     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
933     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
934     PetscFunctionReturn(0);
935   }
936 
937   ierr = VecSet(v,zero);CHKERRQ(ierr);
938   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
939   for (i=0; i<n; i++) { /* loop over rows */
940     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
941     x[i] = 0;
942     for (j=0; j<a->rlen[i]; j++) {
943       if (a->colidx[shift+j*8] == i) {
944         x[i] = a->val[shift+j*8];
945         break;
946       }
947     }
948   }
949   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
950   PetscFunctionReturn(0);
951 }
952 
953 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
954 {
955   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
956   const PetscScalar *l,*r;
957   PetscInt          i,j,m,n,row;
958   PetscErrorCode    ierr;
959 
960   PetscFunctionBegin;
961   if (ll) {
962     /* The local size is used so that VecMPI can be passed to this routine
963        by MatDiagonalScale_MPISELL */
964     ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr);
965     if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
966     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
967     for (i=0; i<a->totalslices; i++) { /* loop over slices */
968       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
969         a->val[j] *= l[8*i+row];
970       }
971     }
972     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
973     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
974   }
975   if (rr) {
976     ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr);
977     if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
978     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
979     for (i=0; i<a->totalslices; i++) { /* loop over slices */
980       for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
981         a->val[j] *= r[a->colidx[j]];
982       }
983     }
984     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
985     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
986   }
987   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
988   PetscFunctionReturn(0);
989 }
990 
991 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode);
992 
993 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
994 {
995   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
996   PetscInt    *cp,i,k,low,high,t,row,col,l;
997   PetscInt    shift;
998   MatScalar   *vp;
999 
1000   PetscFunctionBegin;
1001   for (k=0; k<m; k++) { /* loop over requested rows */
1002     row = im[k];
1003     if (row<0) continue;
1004 #if defined(PETSC_USE_DEBUG)
1005     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1006 #endif
1007     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1008     cp = a->colidx+shift; /* pointer to the row */
1009     vp = a->val+shift; /* pointer to the row */
1010     for (l=0; l<n; l++) { /* loop over requested columns */
1011       col = in[l];
1012       if (col<0) continue;
1013 #if defined(PETSC_USE_DEBUG)
1014       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1);
1015 #endif
1016       high = a->rlen[row]; low = 0; /* assume unsorted */
1017       while (high-low > 5) {
1018         t = (low+high)/2;
1019         if (*(cp+t*8) > col) high = t;
1020         else low = t;
1021       }
1022       for (i=low; i<high; i++) {
1023         if (*(cp+8*i) > col) break;
1024         if (*(cp+8*i) == col) {
1025           *v++ = *(vp+8*i);
1026           goto finished;
1027         }
1028       }
1029       *v++ = 0.0;
1030     finished:;
1031     }
1032   }
1033   PetscFunctionReturn(0);
1034 }
1035 
1036 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1037 {
1038   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1039   PetscInt          i,j,m=A->rmap->n,shift;
1040   const char        *name;
1041   PetscViewerFormat format;
1042   PetscErrorCode    ierr;
1043 
1044   PetscFunctionBegin;
1045   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1046   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1047     PetscInt nofinalvalue = 0;
1048     /*
1049     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1050       nofinalvalue = 1;
1051     }
1052     */
1053     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1054     ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr);
1055     ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr);
1056 #if defined(PETSC_USE_COMPLEX)
1057     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1058 #else
1059     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1060 #endif
1061     ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr);
1062 
1063     for (i=0; i<m; i++) {
1064       shift = a->sliidx[i>>3]+(i&0x07);
1065       for (j=0; j<a->rlen[i]; j++) {
1066 #if defined(PETSC_USE_COMPLEX)
1067         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1068 #else
1069         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1070 #endif
1071       }
1072     }
1073     /*
1074     if (nofinalvalue) {
1075 #if defined(PETSC_USE_COMPLEX)
1076       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr);
1077 #else
1078       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr);
1079 #endif
1080     }
1081     */
1082     ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr);
1083     ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr);
1084     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1085   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1086     PetscFunctionReturn(0);
1087   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1088     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1089     for (i=0; i<m; i++) {
1090       ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1091       shift = a->sliidx[i>>3]+(i&0x07);
1092       for (j=0; j<a->rlen[i]; j++) {
1093 #if defined(PETSC_USE_COMPLEX)
1094         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1095           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1096         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1097           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1098         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1099           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1100         }
1101 #else
1102         if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);}
1103 #endif
1104       }
1105       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1106     }
1107     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1108   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1109     PetscInt    cnt=0,jcnt;
1110     PetscScalar value;
1111 #if defined(PETSC_USE_COMPLEX)
1112     PetscBool   realonly=PETSC_TRUE;
1113     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1114       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1115         realonly = PETSC_FALSE;
1116         break;
1117       }
1118     }
1119 #endif
1120 
1121     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1122     for (i=0; i<m; i++) {
1123       jcnt = 0;
1124       shift = a->sliidx[i>>3]+(i&0x07);
1125       for (j=0; j<A->cmap->n; j++) {
1126         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1127           value = a->val[cnt++];
1128           jcnt++;
1129         } else {
1130           value = 0.0;
1131         }
1132 #if defined(PETSC_USE_COMPLEX)
1133         if (realonly) {
1134           ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr);
1135         } else {
1136           ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr);
1137         }
1138 #else
1139         ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr);
1140 #endif
1141       }
1142       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1143     }
1144     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1145   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1146     PetscInt fshift=1;
1147     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1148 #if defined(PETSC_USE_COMPLEX)
1149     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr);
1150 #else
1151     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr);
1152 #endif
1153     ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr);
1154     for (i=0; i<m; i++) {
1155       shift = a->sliidx[i>>3]+(i&0x07);
1156       for (j=0; j<a->rlen[i]; j++) {
1157 #if defined(PETSC_USE_COMPLEX)
1158         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1159 #else
1160         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1161 #endif
1162       }
1163     }
1164     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1165   } else if (format == PETSC_VIEWER_NATIVE) {
1166     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1167       PetscInt row;
1168       ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr);
1169       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1170 #if defined(PETSC_USE_COMPLEX)
1171         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1172           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1173         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1174           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1175         } else {
1176           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1177         }
1178 #else
1179         ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1180 #endif
1181       }
1182     }
1183   } else {
1184     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1185     if (A->factortype) {
1186       for (i=0; i<m; i++) {
1187         shift = a->sliidx[i>>3]+(i&0x07);
1188         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1189         /* L part */
1190         for (j=shift; j<a->diag[i]; j+=8) {
1191 #if defined(PETSC_USE_COMPLEX)
1192           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1193             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1194           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1195             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1196           } else {
1197             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1198           }
1199 #else
1200           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1201 #endif
1202         }
1203         /* diagonal */
1204         j = a->diag[i];
1205 #if defined(PETSC_USE_COMPLEX)
1206         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1207           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr);
1208         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1209           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr);
1210         } else {
1211           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr);
1212         }
1213 #else
1214         ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr);
1215 #endif
1216 
1217         /* U part */
1218         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1219 #if defined(PETSC_USE_COMPLEX)
1220           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1221             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1222           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1223             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1224           } else {
1225             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1226           }
1227 #else
1228           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1229 #endif
1230         }
1231         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1232       }
1233     } else {
1234       for (i=0; i<m; i++) {
1235         shift = a->sliidx[i>>3]+(i&0x07);
1236         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1237         for (j=0; j<a->rlen[i]; j++) {
1238 #if defined(PETSC_USE_COMPLEX)
1239           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1240             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1241           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1242             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1243           } else {
1244             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1245           }
1246 #else
1247           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);
1248 #endif
1249         }
1250         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1251       }
1252     }
1253     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1254   }
1255   ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
1256   PetscFunctionReturn(0);
1257 }
1258 
1259 #include <petscdraw.h>
1260 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1261 {
1262   Mat               A=(Mat)Aa;
1263   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1264   PetscInt          i,j,m=A->rmap->n,shift;
1265   int               color;
1266   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1267   PetscViewer       viewer;
1268   PetscViewerFormat format;
1269   PetscErrorCode    ierr;
1270 
1271   PetscFunctionBegin;
1272   ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr);
1273   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1274   ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr);
1275 
1276   /* loop over matrix elements drawing boxes */
1277 
1278   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1279     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1280     /* Blue for negative, Cyan for zero and  Red for positive */
1281     color = PETSC_DRAW_BLUE;
1282     for (i=0; i<m; i++) {
1283       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1284       y_l = m - i - 1.0; y_r = y_l + 1.0;
1285       for (j=0; j<a->rlen[i]; j++) {
1286         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1287         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1288         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1289       }
1290     }
1291     color = PETSC_DRAW_CYAN;
1292     for (i=0; i<m; i++) {
1293       shift = a->sliidx[i>>3]+(i&0x07);
1294       y_l = m - i - 1.0; y_r = y_l + 1.0;
1295       for (j=0; j<a->rlen[i]; j++) {
1296         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1297         if (a->val[shift+8*j] !=  0.) continue;
1298         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1299       }
1300     }
1301     color = PETSC_DRAW_RED;
1302     for (i=0; i<m; i++) {
1303       shift = a->sliidx[i>>3]+(i&0x07);
1304       y_l = m - i - 1.0; y_r = y_l + 1.0;
1305       for (j=0; j<a->rlen[i]; j++) {
1306         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1307         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1308         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1309       }
1310     }
1311     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1312   } else {
1313     /* use contour shading to indicate magnitude of values */
1314     /* first determine max of all nonzero values */
1315     PetscReal minv=0.0,maxv=0.0;
1316     PetscInt  count=0;
1317     PetscDraw popup;
1318     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1319       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1320     }
1321     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1322     ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr);
1323     ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr);
1324 
1325     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1326     for (i=0; i<m; i++) {
1327       shift = a->sliidx[i>>3]+(i&0x07);
1328       y_l = m - i - 1.0;
1329       y_r = y_l + 1.0;
1330       for (j=0; j<a->rlen[i]; j++) {
1331         x_l = a->colidx[shift+j*8];
1332         x_r = x_l + 1.0;
1333         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1334         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1335         count++;
1336       }
1337     }
1338     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1339   }
1340   PetscFunctionReturn(0);
1341 }
1342 
1343 #include <petscdraw.h>
1344 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1345 {
1346   PetscDraw      draw;
1347   PetscReal      xr,yr,xl,yl,h,w;
1348   PetscBool      isnull;
1349   PetscErrorCode ierr;
1350 
1351   PetscFunctionBegin;
1352   ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
1353   ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr);
1354   if (isnull) PetscFunctionReturn(0);
1355 
1356   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1357   xr  += w;          yr += h;         xl = -w;     yl = -h;
1358   ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr);
1359   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr);
1360   ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr);
1361   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr);
1362   ierr = PetscDrawSave(draw);CHKERRQ(ierr);
1363   PetscFunctionReturn(0);
1364 }
1365 
1366 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1367 {
1368   PetscBool      iascii,isbinary,isdraw;
1369   PetscErrorCode ierr;
1370 
1371   PetscFunctionBegin;
1372   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
1373   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr);
1374   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr);
1375   if (iascii) {
1376     ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr);
1377   } else if (isbinary) {
1378     /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */
1379   } else if (isdraw) {
1380     ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr);
1381   }
1382   PetscFunctionReturn(0);
1383 }
1384 
1385 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1386 {
1387   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1388   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1389   MatScalar      *vp;
1390   PetscErrorCode ierr;
1391 
1392   PetscFunctionBegin;
1393   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1394   /* To do: compress out the unused elements */
1395   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
1396   ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr);
1397   ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr);
1398   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr);
1399   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1400   for (i=0; i<a->totalslices; ++i) {
1401     shift = a->sliidx[i];    /* starting index of the slice */
1402     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1403     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1404     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1405       row  = 8*i + row_in_slice;
1406       nrow = a->rlen[row]; /* number of nonzeros in row */
1407       /*
1408         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1409         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1410       */
1411       lastcol = 0;
1412       if (nrow>0) { /* nonempty row */
1413         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1414       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1415         for (j=1;j<8;j++) {
1416           if (a->rlen[8*i+j]) {
1417             lastcol = cp[j];
1418             break;
1419           }
1420         }
1421       } else {
1422         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1423       }
1424 
1425       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1426         cp[8*k+row_in_slice] = lastcol;
1427         vp[8*k+row_in_slice] = (MatScalar)0;
1428       }
1429     }
1430   }
1431 
1432   A->info.mallocs += a->reallocs;
1433   a->reallocs      = 0;
1434 
1435   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1436   PetscFunctionReturn(0);
1437 }
1438 
1439 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1440 {
1441   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1442 
1443   PetscFunctionBegin;
1444   info->block_size   = 1.0;
1445   info->nz_allocated = (double)a->maxallocmat;
1446   info->nz_used      = (double)a->sliidx[a->totalslices]; /* include padding zeros */
1447   info->nz_unneeded  = (double)(a->maxallocmat-a->sliidx[a->totalslices]);
1448   info->assemblies   = (double)A->num_ass;
1449   info->mallocs      = (double)A->info.mallocs;
1450   info->memory       = ((PetscObject)A)->mem;
1451   if (A->factortype) {
1452     info->fill_ratio_given  = A->info.fill_ratio_given;
1453     info->fill_ratio_needed = A->info.fill_ratio_needed;
1454     info->factor_mallocs    = A->info.factor_mallocs;
1455   } else {
1456     info->fill_ratio_given  = 0;
1457     info->fill_ratio_needed = 0;
1458     info->factor_mallocs    = 0;
1459   }
1460   PetscFunctionReturn(0);
1461 }
1462 
1463 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1464 {
1465   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1466   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1467   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1468   MatScalar      *vp,value;
1469   PetscErrorCode ierr;
1470 
1471   PetscFunctionBegin;
1472   for (k=0; k<m; k++) { /* loop over added rows */
1473     row = im[k];
1474     if (row < 0) continue;
1475 #if defined(PETSC_USE_DEBUG)
1476     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1477 #endif
1478     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1479     cp    = a->colidx+shift; /* pointer to the row */
1480     vp    = a->val+shift; /* pointer to the row */
1481     nrow  = a->rlen[row];
1482     low   = 0;
1483     high  = nrow;
1484 
1485     for (l=0; l<n; l++) { /* loop over added columns */
1486       col = in[l];
1487       if (col<0) continue;
1488 #if defined(PETSC_USE_DEBUG)
1489       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1);
1490 #endif
1491       if (a->roworiented) {
1492         value = v[l+k*n];
1493       } else {
1494         value = v[k+l*m];
1495       }
1496       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1497 
1498       /* search in this row for the specified colmun, i indicates the column to be set */
1499       if (col <= lastcol) low = 0;
1500       else high = nrow;
1501       lastcol = col;
1502       while (high-low > 5) {
1503         t = (low+high)/2;
1504         if (*(cp+t*8) > col) high = t;
1505         else low = t;
1506       }
1507       for (i=low; i<high; i++) {
1508         if (*(cp+i*8) > col) break;
1509         if (*(cp+i*8) == col) {
1510           if (is == ADD_VALUES) *(vp+i*8) += value;
1511           else *(vp+i*8) = value;
1512           low = i + 1;
1513           goto noinsert;
1514         }
1515       }
1516       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1517       if (nonew == 1) goto noinsert;
1518       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col);
1519       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1520       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1521       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1522       for (ii=nrow-1; ii>=i; ii--) {
1523         *(cp+(ii+1)*8) = *(cp+ii*8);
1524         *(vp+(ii+1)*8) = *(vp+ii*8);
1525       }
1526       a->rlen[row]++;
1527       *(cp+i*8) = col;
1528       *(vp+i*8) = value;
1529       a->nz++;
1530       A->nonzerostate++;
1531       low = i+1; high++; nrow++;
1532 noinsert:;
1533     }
1534     a->rlen[row] = nrow;
1535   }
1536   PetscFunctionReturn(0);
1537 }
1538 
1539 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1540 {
1541   PetscErrorCode ierr;
1542 
1543   PetscFunctionBegin;
1544   /* If the two matrices have the same copy implementation, use fast copy. */
1545   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1546     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1547     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1548 
1549     if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1550     ierr = PetscMemcpy(b->val,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1551   } else {
1552     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1553   }
1554   PetscFunctionReturn(0);
1555 }
1556 
1557 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1558 {
1559   PetscErrorCode ierr;
1560 
1561   PetscFunctionBegin;
1562   ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr);
1563   PetscFunctionReturn(0);
1564 }
1565 
1566 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1567 {
1568   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1569 
1570   PetscFunctionBegin;
1571   *array = a->val;
1572   PetscFunctionReturn(0);
1573 }
1574 
1575 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1576 {
1577   PetscFunctionBegin;
1578   PetscFunctionReturn(0);
1579 }
1580 
1581 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1582 {
1583   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1584   PetscInt    i;
1585   MatScalar   *aval=a->val;
1586 
1587   PetscFunctionBegin;
1588   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1589   PetscFunctionReturn(0);
1590 }
1591 
1592 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1593 {
1594   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1595   PetscInt       i;
1596   MatScalar      *aval=a->val;
1597   PetscErrorCode ierr;
1598 
1599   PetscFunctionBegin;
1600   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1601   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1602   PetscFunctionReturn(0);
1603 }
1604 
1605 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1606 {
1607   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1608   MatScalar      *aval=a->val;
1609   PetscScalar    oalpha=alpha;
1610   PetscBLASInt   one=1,size;
1611   PetscErrorCode ierr;
1612 
1613   PetscFunctionBegin;
1614   ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr);
1615   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1616   ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1617   ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr);
1618   PetscFunctionReturn(0);
1619 }
1620 
1621 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1622 {
1623   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1624   PetscErrorCode ierr;
1625 
1626   PetscFunctionBegin;
1627   if (!Y->preallocated || !y->nz) {
1628     ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr);
1629   }
1630   ierr = MatShift_Basic(Y,a);CHKERRQ(ierr);
1631   PetscFunctionReturn(0);
1632 }
1633 
1634 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1635 {
1636   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1637   PetscScalar       *x,sum,*t;
1638   const MatScalar   *idiag=0,*mdiag;
1639   const PetscScalar *b,*xb;
1640   PetscInt          n,m=A->rmap->n,i,j,shift;
1641   const PetscInt    *diag;
1642   PetscErrorCode    ierr;
1643 
1644   PetscFunctionBegin;
1645   its = its*lits;
1646 
1647   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1648   if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);}
1649   a->fshift = fshift;
1650   a->omega  = omega;
1651 
1652   diag  = a->diag;
1653   t     = a->ssor_work;
1654   idiag = a->idiag;
1655   mdiag = a->mdiag;
1656 
1657   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1658   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1659   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1660   if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1661   if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1662   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1663 
1664   if (flag & SOR_ZERO_INITIAL_GUESS) {
1665     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1666       for (i=0; i<m; i++) {
1667         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1668         sum   = b[i];
1669         n     = (diag[i]-shift)/8;
1670         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1671         t[i]  = sum;
1672         x[i]  = sum*idiag[i];
1673       }
1674       xb   = t;
1675       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1676     } else xb = b;
1677     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1678       for (i=m-1; i>=0; i--) {
1679         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1680         sum   = xb[i];
1681         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1682         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1683         if (xb == b) {
1684           x[i] = sum*idiag[i];
1685         } else {
1686           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1687         }
1688       }
1689       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1690     }
1691     its--;
1692   }
1693   while (its--) {
1694     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1695       for (i=0; i<m; i++) {
1696         /* lower */
1697         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1698         sum   = b[i];
1699         n     = (diag[i]-shift)/8;
1700         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1701         t[i]  = sum;             /* save application of the lower-triangular part */
1702         /* upper */
1703         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1704         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1705         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1706       }
1707       xb   = t;
1708       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1709     } else xb = b;
1710     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1711       for (i=m-1; i>=0; i--) {
1712         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1713         sum = xb[i];
1714         if (xb == b) {
1715           /* whole matrix (no checkpointing available) */
1716           n     = a->rlen[i];
1717           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1718           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1719         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1720           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1721           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1722           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1723         }
1724       }
1725       if (xb == b) {
1726         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1727       } else {
1728         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1729       }
1730     }
1731   }
1732   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1733   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1734   PetscFunctionReturn(0);
1735 }
1736 
1737 /* -------------------------------------------------------------------*/
1738 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1739                                        MatGetRow_SeqSELL,
1740                                        MatRestoreRow_SeqSELL,
1741                                        MatMult_SeqSELL,
1742                                /* 4*/  MatMultAdd_SeqSELL,
1743                                        MatMultTranspose_SeqSELL,
1744                                        MatMultTransposeAdd_SeqSELL,
1745                                        0,
1746                                        0,
1747                                        0,
1748                                /* 10*/ 0,
1749                                        0,
1750                                        0,
1751                                        MatSOR_SeqSELL,
1752                                        0,
1753                                /* 15*/ MatGetInfo_SeqSELL,
1754                                        MatEqual_SeqSELL,
1755                                        MatGetDiagonal_SeqSELL,
1756                                        MatDiagonalScale_SeqSELL,
1757                                        0,
1758                                /* 20*/ 0,
1759                                        MatAssemblyEnd_SeqSELL,
1760                                        MatSetOption_SeqSELL,
1761                                        MatZeroEntries_SeqSELL,
1762                                /* 24*/ 0,
1763                                        0,
1764                                        0,
1765                                        0,
1766                                        0,
1767                                /* 29*/ MatSetUp_SeqSELL,
1768                                        0,
1769                                        0,
1770                                        0,
1771                                        0,
1772                                /* 34*/ MatDuplicate_SeqSELL,
1773                                        0,
1774                                        0,
1775                                        0,
1776                                        0,
1777                                /* 39*/ 0,
1778                                        0,
1779                                        0,
1780                                        MatGetValues_SeqSELL,
1781                                        MatCopy_SeqSELL,
1782                                /* 44*/ 0,
1783                                        MatScale_SeqSELL,
1784                                        MatShift_SeqSELL,
1785                                        0,
1786                                        0,
1787                                /* 49*/ 0,
1788                                        0,
1789                                        0,
1790                                        0,
1791                                        0,
1792                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1793                                        0,
1794                                        0,
1795                                        0,
1796                                        0,
1797                                /* 59*/ 0,
1798                                        MatDestroy_SeqSELL,
1799                                        MatView_SeqSELL,
1800                                        0,
1801                                        0,
1802                                /* 64*/ 0,
1803                                        0,
1804                                        0,
1805                                        0,
1806                                        0,
1807                                /* 69*/ 0,
1808                                        0,
1809                                        0,
1810                                        0,
1811                                        0,
1812                                /* 74*/ 0,
1813                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1814                                        0,
1815                                        0,
1816                                        0,
1817                                /* 79*/ 0,
1818                                        0,
1819                                        0,
1820                                        0,
1821                                        0,
1822                                /* 84*/ 0,
1823                                        0,
1824                                        0,
1825                                        0,
1826                                        0,
1827                                /* 89*/ 0,
1828                                        0,
1829                                        0,
1830                                        0,
1831                                        0,
1832                                /* 94*/ 0,
1833                                        0,
1834                                        0,
1835                                        0,
1836                                        0,
1837                                /* 99*/ 0,
1838                                        0,
1839                                        0,
1840                                        MatConjugate_SeqSELL,
1841                                        0,
1842                                /*104*/ 0,
1843                                        0,
1844                                        0,
1845                                        0,
1846                                        0,
1847                                /*109*/ 0,
1848                                        0,
1849                                        0,
1850                                        0,
1851                                        MatMissingDiagonal_SeqSELL,
1852                                /*114*/ 0,
1853                                        0,
1854                                        0,
1855                                        0,
1856                                        0,
1857                                /*119*/ 0,
1858                                        0,
1859                                        0,
1860                                        0,
1861                                        0,
1862                                /*124*/ 0,
1863                                        0,
1864                                        0,
1865                                        0,
1866                                        0,
1867                                /*129*/ 0,
1868                                        0,
1869                                        0,
1870                                        0,
1871                                        0,
1872                                /*134*/ 0,
1873                                        0,
1874                                        0,
1875                                        0,
1876                                        0,
1877                                /*139*/ 0,
1878                                        0,
1879                                        0,
1880                                        MatFDColoringSetUp_SeqXAIJ,
1881                                        0,
1882                                 /*144*/0
1883 };
1884 
1885 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1886 {
1887   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1888   PetscErrorCode ierr;
1889 
1890   PetscFunctionBegin;
1891   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1892 
1893   /* allocate space for values if not already there */
1894   if (!a->saved_values) {
1895     ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr);
1896     ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1897   }
1898 
1899   /* copy values over */
1900   ierr = PetscMemcpy(a->saved_values,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1901   PetscFunctionReturn(0);
1902 }
1903 
1904 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1905 {
1906   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1907   PetscErrorCode ierr;
1908 
1909   PetscFunctionBegin;
1910   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1911   if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1912   /* copy values over */
1913   ierr = PetscMemcpy(a->val,a->saved_values,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1914   PetscFunctionReturn(0);
1915 }
1916 
1917 /*@C
1918  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1919 
1920  Not Collective
1921 
1922  Input Parameters:
1923  .  mat - a MATSEQSELL matrix
1924  .  array - pointer to the data
1925 
1926  Level: intermediate
1927 
1928  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1929  @*/
1930 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1931 {
1932   PetscErrorCode ierr;
1933 
1934   PetscFunctionBegin;
1935   ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr);
1936   PetscFunctionReturn(0);
1937 }
1938 
1939 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1940 {
1941   Mat_SeqSELL    *b;
1942   PetscMPIInt    size;
1943   PetscErrorCode ierr;
1944 
1945   PetscFunctionBegin;
1946   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
1947   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1948 
1949   ierr = PetscNewLog(B,&b);CHKERRQ(ierr);
1950 
1951   B->data = (void*)b;
1952 
1953   ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1954 
1955   b->row                = 0;
1956   b->col                = 0;
1957   b->icol               = 0;
1958   b->reallocs           = 0;
1959   b->ignorezeroentries  = PETSC_FALSE;
1960   b->roworiented        = PETSC_TRUE;
1961   b->nonew              = 0;
1962   b->diag               = 0;
1963   b->solve_work         = 0;
1964   B->spptr              = 0;
1965   b->saved_values       = 0;
1966   b->idiag              = 0;
1967   b->mdiag              = 0;
1968   b->ssor_work          = 0;
1969   b->omega              = 1.0;
1970   b->fshift             = 0.0;
1971   b->idiagvalid         = PETSC_FALSE;
1972   b->keepnonzeropattern = PETSC_FALSE;
1973 
1974   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr);
1975   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr);
1976   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr);
1977   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr);
1978   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr);
1979   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr);
1980   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr);
1981   PetscFunctionReturn(0);
1982 }
1983 
1984 /*
1985  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1986  */
1987 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1988 {
1989   Mat_SeqSELL    *c,*a=(Mat_SeqSELL*)A->data;
1990   PetscInt       i,m=A->rmap->n;
1991   PetscInt       totalslices=a->totalslices;
1992   PetscErrorCode ierr;
1993 
1994   PetscFunctionBegin;
1995   c = (Mat_SeqSELL*)C->data;
1996 
1997   C->factortype = A->factortype;
1998   c->row        = 0;
1999   c->col        = 0;
2000   c->icol       = 0;
2001   c->reallocs   = 0;
2002 
2003   C->assembled = PETSC_TRUE;
2004 
2005   ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr);
2006   ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr);
2007 
2008   ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr);
2009   ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2010   ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr);
2011   ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
2012 
2013   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
2014   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
2015 
2016   /* allocate the matrix space */
2017   if (mallocmatspace) {
2018     ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr);
2019     ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
2020 
2021     c->singlemalloc = PETSC_TRUE;
2022 
2023     if (m > 0) {
2024       ierr = PetscMemcpy(c->colidx,a->colidx,(a->maxallocmat)*sizeof(PetscInt));CHKERRQ(ierr);
2025       if (cpvalues == MAT_COPY_VALUES) {
2026         ierr = PetscMemcpy(c->val,a->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2027       } else {
2028         ierr = PetscMemzero(c->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2029       }
2030     }
2031   }
2032 
2033   c->ignorezeroentries = a->ignorezeroentries;
2034   c->roworiented       = a->roworiented;
2035   c->nonew             = a->nonew;
2036   if (a->diag) {
2037     ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr);
2038     ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2039     for (i=0; i<m; i++) {
2040       c->diag[i] = a->diag[i];
2041     }
2042   } else c->diag = 0;
2043 
2044   c->solve_work         = 0;
2045   c->saved_values       = 0;
2046   c->idiag              = 0;
2047   c->ssor_work          = 0;
2048   c->keepnonzeropattern = a->keepnonzeropattern;
2049   c->free_val           = PETSC_TRUE;
2050   c->free_colidx        = PETSC_TRUE;
2051 
2052   c->maxallocmat  = a->maxallocmat;
2053   c->maxallocrow  = a->maxallocrow;
2054   c->rlenmax      = a->rlenmax;
2055   c->nz           = a->nz;
2056   C->preallocated = PETSC_TRUE;
2057 
2058   c->nonzerorowcnt = a->nonzerorowcnt;
2059   C->nonzerostate  = A->nonzerostate;
2060 
2061   ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr);
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2066 {
2067   PetscErrorCode ierr;
2068 
2069   PetscFunctionBegin;
2070   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
2071   ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
2072   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2073     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
2074   }
2075   ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
2076   ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr);
2077   PetscFunctionReturn(0);
2078 }
2079 
2080 /*@C
2081  MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2082 
2083  Collective on MPI_Comm
2084 
2085  Input Parameters:
2086  +  comm - MPI communicator, set to PETSC_COMM_SELF
2087  .  m - number of rows
2088  .  n - number of columns
2089  .  rlenmax - maximum number of nonzeros in a row
2090  -  rlen - array containing the number of nonzeros in the various rows
2091  (possibly different for each row) or NULL
2092 
2093  Output Parameter:
2094  .  A - the matrix
2095 
2096  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2097  MatXXXXSetPreallocation() paradgm instead of this routine directly.
2098  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2099 
2100  Notes:
2101  If nnz is given then nz is ignored
2102 
2103  Specify the preallocated storage with either rlenmax or rlen (not both).
2104  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2105  allocation.  For large problems you MUST preallocate memory or you
2106  will get TERRIBLE performance, see the users' manual chapter on matrices.
2107 
2108  Level: intermediate
2109 
2110  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatCreateSeqSELLWithArrays()
2111 
2112  @*/
2113 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2114 {
2115   PetscErrorCode ierr;
2116 
2117   PetscFunctionBegin;
2118   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2119   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2120   ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr);
2121   ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr);
2122   PetscFunctionReturn(0);
2123 }
2124 
2125 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2126 {
2127   Mat_SeqSELL     *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2128   PetscInt       totalslices=a->totalslices;
2129   PetscErrorCode ierr;
2130 
2131   PetscFunctionBegin;
2132   /* If the  matrix dimensions are not equal,or no of nonzeros */
2133   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2134     *flg = PETSC_FALSE;
2135     PetscFunctionReturn(0);
2136   }
2137   /* if the a->colidx are the same */
2138   ierr = PetscMemcmp(a->colidx,b->colidx,a->sliidx[totalslices]*sizeof(PetscInt),flg);CHKERRQ(ierr);
2139   if (!*flg) PetscFunctionReturn(0);
2140   /* if a->val are the same */
2141   ierr = PetscMemcmp(a->val,b->val,a->sliidx[totalslices]*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2142   PetscFunctionReturn(0);
2143 }
2144 
2145 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2146 {
2147   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2148 
2149   PetscFunctionBegin;
2150   a->idiagvalid  = PETSC_FALSE;
2151   PetscFunctionReturn(0);
2152 }
2153 
2154 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2155 {
2156 #if defined(PETSC_USE_COMPLEX)
2157   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2158   PetscInt    i;
2159   PetscScalar *val = a->val;
2160 
2161   PetscFunctionBegin;
2162   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2163     val[i] = PetscConj(val[i]);
2164   }
2165 #else
2166   PetscFunctionBegin;
2167 #endif
2168   PetscFunctionReturn(0);
2169 }
2170