xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 715b75585fd277b048903486c3bbde8d0b0c8e8b)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 
10   #include <immintrin.h>
11 
12   #if !defined(_MM_SCALE_8)
13   #define _MM_SCALE_8    8
14   #endif
15 
16   #if defined(__AVX512F__)
17   /* these do not work
18    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
19    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
20   */
21     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
22     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
23     vec_idx  = _mm256_loadu_si256((__m256i const*)acolidx); \
24     vec_vals = _mm512_loadu_pd(aval); \
25     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
26     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
27   #elif defined(__AVX2__) && defined(__FMA__)
28     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
29     vec_vals = _mm256_loadu_pd(aval); \
30     vec_idx  = _mm_loadu_si128((__m128i const*)acolidx); /* SSE2 */ \
31     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
32     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
33   #endif
34 #endif  /* PETSC_HAVE_IMMINTRIN_H */
35 
36 /*@C
37  MatSeqSELLSetPreallocation - For good matrix assembly performance
38  the user should preallocate the matrix storage by setting the parameter nz
39  (or the array nnz).  By setting these parameters accurately, performance
40  during matrix assembly can be increased significantly.
41 
42  Collective on MPI_Comm
43 
44  Input Parameters:
45  +  B - The matrix
46  .  nz - number of nonzeros per row (same for all rows)
47  -  nnz - array containing the number of nonzeros in the various rows
48  (possibly different for each row) or NULL
49 
50  Notes:
51  If nnz is given then nz is ignored.
52 
53  Specify the preallocated storage with either nz or nnz (not both).
54  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
55  allocation.  For large problems you MUST preallocate memory or you
56  will get TERRIBLE performance, see the users' manual chapter on matrices.
57 
58  You can call MatGetInfo() to get information on how effective the preallocation was;
59  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
60  You can also run with the option -info and look for messages with the string
61  malloc in them to see if additional memory allocation was needed.
62 
63  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
64  entries or columns indices.
65 
66  The maximum number of nonzeos in any row should be as accuate as possible.
67  If it is underesitmated, you will get bad performance due to reallocation
68  (MatSeqXSELLReallocateSELL).
69 
70  Level: intermediate
71 
72  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
73 
74  @*/
75 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
76 {
77   PetscErrorCode ierr;
78 
79   PetscFunctionBegin;
80   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
81   PetscValidType(B,1);
82   ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr);
83   PetscFunctionReturn(0);
84 }
85 
86 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
87 {
88   Mat_SeqSELL    *b;
89   PetscInt       i,j,totalslices;
90   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
91   PetscErrorCode ierr;
92 
93   PetscFunctionBegin;
94   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
95   if (maxallocrow == MAT_SKIP_ALLOCATION) {
96     skipallocation = PETSC_TRUE;
97     maxallocrow    = 0;
98   }
99 
100   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
101   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
102 
103   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
104   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
105   if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow);
106   if (rlen) {
107     for (i=0; i<B->rmap->n; i++) {
108       if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]);
109       if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n);
110     }
111   }
112 
113   B->preallocated = PETSC_TRUE;
114 
115   b = (Mat_SeqSELL*)B->data;
116 
117   totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */
118   b->totalslices = totalslices;
119   if (!skipallocation) {
120     if (B->rmap->n & 0x07) PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);
121 
122     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
123       ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr);
124       ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
125     }
126     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
127       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
128       else if (maxallocrow < 0) maxallocrow = 1;
129       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
130     } else {
131       maxallocrow = 0;
132       b->sliidx[0] = 0;
133       for (i=1; i<totalslices; i++) {
134         b->sliidx[i] = 0;
135         for (j=0;j<8;j++) {
136           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
137         }
138         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
139         b->sliidx[i] = b->sliidx[i-1] + 8*b->sliidx[i];
140       }
141       /* last slice */
142       b->sliidx[totalslices] = 0;
143       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
144       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
145       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
146     }
147 
148     /* allocate space for val, colidx, rlen */
149     /* FIXME: should B's old memory be unlogged? */
150     ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr);
151     /* FIXME: assuming an element of the bit array takes 8 bits */
152     ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr);
153     ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
154     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
155     ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr);
156     ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr);
157 
158     b->singlemalloc = PETSC_TRUE;
159     b->free_val     = PETSC_TRUE;
160     b->free_colidx  = PETSC_TRUE;
161   } else {
162     b->free_val    = PETSC_FALSE;
163     b->free_colidx = PETSC_FALSE;
164   }
165 
166   b->nz               = 0;
167   b->maxallocrow      = maxallocrow;
168   b->rlenmax          = maxallocrow;
169   b->maxallocmat      = b->sliidx[totalslices];
170   B->info.nz_unneeded = (double)b->maxallocmat;
171   if (realalloc) {
172     ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
173   }
174   PetscFunctionReturn(0);
175 }
176 
177 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
178 {
179   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
180   PetscInt    shift;
181 
182   PetscFunctionBegin;
183   if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row);
184   if (nz) *nz = a->rlen[row];
185   shift = a->sliidx[row>>3]+(row&0x07);
186   if (!a->getrowcols) {
187     PetscErrorCode ierr;
188 
189     ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr);
190   }
191   if (idx) {
192     PetscInt j;
193     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
194     *idx = a->getrowcols;
195   }
196   if (v) {
197     PetscInt j;
198     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
199     *v = a->getrowvals;
200   }
201   PetscFunctionReturn(0);
202 }
203 
204 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
205 {
206   PetscFunctionBegin;
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
211 {
212   Mat            B;
213   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
214   PetscInt       i;
215   PetscErrorCode ierr;
216 
217   PetscFunctionBegin;
218   if (reuse == MAT_REUSE_MATRIX) {
219     B    = *newmat;
220     ierr = MatZeroEntries(B);CHKERRQ(ierr);
221   } else {
222     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
223     ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr);
224     ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr);
225     ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr);
226   }
227 
228   for (i=0; i<A->rmap->n; i++) {
229     PetscInt    nz = 0,*cols = NULL;
230     PetscScalar *vals = NULL;
231 
232     ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
233     ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
234     ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
235   }
236 
237   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
238   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
239   B->rmap->bs = A->rmap->bs;
240 
241   if (reuse == MAT_INPLACE_MATRIX) {
242     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
243   } else {
244     *newmat = B;
245   }
246   PetscFunctionReturn(0);
247 }
248 
249 #include <../src/mat/impls/aij/seq/aij.h>
250 
251 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
252 {
253   Mat               B;
254   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
255   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
256   const PetscInt    *cols;
257   const PetscScalar *vals;
258   PetscErrorCode    ierr;
259 
260   PetscFunctionBegin;
261 
262   if (reuse == MAT_REUSE_MATRIX) {
263     B = *newmat;
264   } else {
265     /* Can we just use ilen? */
266     ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr);
267     for (i=0; i<m; i++) {
268       rowlengths[i] = ai[i+1] - ai[i];
269     }
270 
271     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
272     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
273     ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr);
274     ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr);
275     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
276   }
277 
278   for (row=0; row<m; row++) {
279     ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
280     ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
281     ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
282   }
283   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
284   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
285   B->rmap->bs = A->rmap->bs;
286 
287   if (reuse == MAT_INPLACE_MATRIX) {
288     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
289   } else {
290     *newmat = B;
291   }
292   PetscFunctionReturn(0);
293 }
294 
295 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
296 {
297   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
298   PetscScalar       *y;
299   const PetscScalar *x;
300   const MatScalar   *aval=a->val;
301   PetscInt          totalslices=a->totalslices;
302   const PetscInt    *acolidx=a->colidx;
303   PetscInt          i,j;
304   PetscErrorCode    ierr;
305 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
306   __m512d           vec_x,vec_y,vec_vals;
307   __m256i           vec_idx;
308   __mmask8          mask;
309   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
310   __m256i           vec_idx2,vec_idx3,vec_idx4;
311 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
312   __m128i           vec_idx;
313   __m256d           vec_x,vec_y,vec_y2,vec_vals;
314   MatScalar         yval;
315   PetscInt          r,rows_left,row,nnz_in_row;
316 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
317   __m128d           vec_x_tmp;
318   __m256d           vec_x,vec_y,vec_y2,vec_vals;
319   MatScalar         yval;
320   PetscInt          r,rows_left,row,nnz_in_row;
321 #else
322   PetscScalar       sum[8];
323 #endif
324 
325 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
326 #pragma disjoint(*x,*y,*aval)
327 #endif
328 
329   PetscFunctionBegin;
330   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
331   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
332 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
333   for (i=0; i<totalslices; i++) { /* loop over slices */
334     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
335     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
336 
337     vec_y  = _mm512_setzero_pd();
338     vec_y2 = _mm512_setzero_pd();
339     vec_y3 = _mm512_setzero_pd();
340     vec_y4 = _mm512_setzero_pd();
341 
342     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
343     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
344     case 3:
345       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
346       acolidx += 8; aval += 8;
347       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
348       acolidx += 8; aval += 8;
349       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
350       acolidx += 8; aval += 8;
351       j += 3;
352       break;
353     case 2:
354       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
355       acolidx += 8; aval += 8;
356       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
357       acolidx += 8; aval += 8;
358       j += 2;
359       break;
360     case 1:
361       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
362       acolidx += 8; aval += 8;
363       j += 1;
364       break;
365     }
366     #pragma novector
367     for (; j<(a->sliidx[i+1]>>3); j+=4) {
368       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
369       acolidx += 8; aval += 8;
370       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
371       acolidx += 8; aval += 8;
372       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
373       acolidx += 8; aval += 8;
374       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
375       acolidx += 8; aval += 8;
376     }
377 
378     vec_y = _mm512_add_pd(vec_y,vec_y2);
379     vec_y = _mm512_add_pd(vec_y,vec_y3);
380     vec_y = _mm512_add_pd(vec_y,vec_y4);
381     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
382       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
383       _mm512_mask_storeu_pd(&y[8*i],mask,vec_y);
384     } else {
385       _mm512_storeu_pd(&y[8*i],vec_y);
386     }
387   }
388 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
389   for (i=0; i<totalslices; i++) { /* loop over full slices */
390     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
391     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
392 
393     /* last slice may have padding rows. Don't use vectorization. */
394     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
395       rows_left = A->rmap->n - 8*i;
396       for (r=0; r<rows_left; ++r) {
397         yval = (MatScalar)0;
398         row = 8*i + r;
399         nnz_in_row = a->rlen[row];
400         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
401         y[row] = yval;
402       }
403       break;
404     }
405 
406     vec_y  = _mm256_setzero_pd();
407     vec_y2 = _mm256_setzero_pd();
408 
409     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
410     #pragma novector
411     #pragma unroll(2)
412     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
413       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
414       aval += 4; acolidx += 4;
415       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
416       aval += 4; acolidx += 4;
417     }
418 
419     _mm256_storeu_pd(y+i*8,vec_y);
420     _mm256_storeu_pd(y+i*8+4,vec_y2);
421   }
422 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
423   for (i=0; i<totalslices; i++) { /* loop over full slices */
424     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
425     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
426 
427     vec_y  = _mm256_setzero_pd();
428     vec_y2 = _mm256_setzero_pd();
429 
430     /* last slice may have padding rows. Don't use vectorization. */
431     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
432       rows_left = A->rmap->n - 8*i;
433       for (r=0; r<rows_left; ++r) {
434         yval = (MatScalar)0;
435         row = 8*i + r;
436         nnz_in_row = a->rlen[row];
437         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
438         y[row] = yval;
439       }
440       break;
441     }
442 
443     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
444     #pragma novector
445     #pragma unroll(2)
446     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
447       vec_vals  = _mm256_loadu_pd(aval);
448       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
449       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
450       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
451       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
454       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
455       aval     += 4;
456 
457       vec_vals  = _mm256_loadu_pd(aval);
458       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
459       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
460       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
464       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
465       aval     += 4;
466     }
467 
468     _mm256_storeu_pd(y + i*8,     vec_y);
469     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
470   }
471 #else
472   for (i=0; i<totalslices; i++) { /* loop over slices */
473     for (j=0; j<8; j++) sum[j] = 0.0;
474     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
475       sum[0] += aval[j] * x[acolidx[j]];
476       sum[1] += aval[j+1] * x[acolidx[j+1]];
477       sum[2] += aval[j+2] * x[acolidx[j+2]];
478       sum[3] += aval[j+3] * x[acolidx[j+3]];
479       sum[4] += aval[j+4] * x[acolidx[j+4]];
480       sum[5] += aval[j+5] * x[acolidx[j+5]];
481       sum[6] += aval[j+6] * x[acolidx[j+6]];
482       sum[7] += aval[j+7] * x[acolidx[j+7]];
483     }
484     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
485       for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
486     } else {
487       for(j=0; j<8; j++) y[8*i+j] = sum[j];
488     }
489   }
490 #endif
491 
492   ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */
493   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
494   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
499 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
500 {
501   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
502   PetscScalar       *y,*z;
503   const PetscScalar *x;
504   const MatScalar   *aval=a->val;
505   PetscInt          totalslices=a->totalslices;
506   const PetscInt    *acolidx=a->colidx;
507   PetscInt          i,j;
508   PetscErrorCode    ierr;
509 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
510   __m512d           vec_x,vec_y,vec_vals;
511   __m256i           vec_idx;
512   __mmask8          mask;
513   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
514   __m256i           vec_idx2,vec_idx3,vec_idx4;
515 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
516   __m128d           vec_x_tmp;
517   __m256d           vec_x,vec_y,vec_y2,vec_vals;
518   MatScalar         yval;
519   PetscInt          r,row,nnz_in_row;
520 #else
521   PetscScalar       sum[8];
522 #endif
523 
524 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
525 #pragma disjoint(*x,*y,*aval)
526 #endif
527 
528   PetscFunctionBegin;
529   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
530   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
531 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
532   for (i=0; i<totalslices; i++) { /* loop over slices */
533     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
534     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
535 
536     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
537       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
538       vec_y  = _mm512_mask_loadu_pd(vec_y,mask,&y[8*i]);
539     } else {
540       vec_y  = _mm512_loadu_pd(&y[8*i]);
541     }
542     vec_y2 = _mm512_setzero_pd();
543     vec_y3 = _mm512_setzero_pd();
544     vec_y4 = _mm512_setzero_pd();
545 
546     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
547     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
548     case 3:
549       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
550       acolidx += 8; aval += 8;
551       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
552       acolidx += 8; aval += 8;
553       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
554       acolidx += 8; aval += 8;
555       j += 3;
556       break;
557     case 2:
558       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
559       acolidx += 8; aval += 8;
560       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
561       acolidx += 8; aval += 8;
562       j += 2;
563       break;
564     case 1:
565       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
566       acolidx += 8; aval += 8;
567       j += 1;
568       break;
569     }
570     #pragma novector
571     for (; j<(a->sliidx[i+1]>>3); j+=4) {
572       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
573       acolidx += 8; aval += 8;
574       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
575       acolidx += 8; aval += 8;
576       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
577       acolidx += 8; aval += 8;
578       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
579       acolidx += 8; aval += 8;
580     }
581 
582     vec_y = _mm512_add_pd(vec_y,vec_y2);
583     vec_y = _mm512_add_pd(vec_y,vec_y3);
584     vec_y = _mm512_add_pd(vec_y,vec_y4);
585     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
586       _mm512_mask_storeu_pd(&z[8*i],mask,vec_y);
587     } else {
588       _mm512_storeu_pd(&z[8*i],vec_y);
589     }
590   }
591 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
592   for (i=0; i<totalslices; i++) { /* loop over full slices */
593     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
594     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
595 
596     /* last slice may have padding rows. Don't use vectorization. */
597     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
598       for (r=0; r<(A->rmap->n & 0x07); ++r) {
599         row        = 8*i + r;
600         yval       = (MatScalar)0.0;
601         nnz_in_row = a->rlen[row];
602         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
603         z[row] = y[row] + yval;
604       }
605       break;
606     }
607 
608     vec_y  = _mm256_loadu_pd(y+8*i);
609     vec_y2 = _mm256_loadu_pd(y+8*i+4);
610 
611     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
612     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
613       vec_vals  = _mm256_loadu_pd(aval);
614       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
615       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
616       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
617       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
618       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
619       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
620       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
621       aval     += 4;
622 
623       vec_vals  = _mm256_loadu_pd(aval);
624       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
625       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
626       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
627       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
628       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
629       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
630       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
631       aval     += 4;
632     }
633 
634     _mm256_storeu_pd(z+i*8,vec_y);
635     _mm256_storeu_pd(z+i*8+4,vec_y2);
636   }
637 #else
638   for (i=0; i<totalslices; i++) { /* loop over slices */
639     for (j=0; j<8; j++) sum[j] = 0.0;
640     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
641       sum[0] += aval[j] * x[acolidx[j]];
642       sum[1] += aval[j+1] * x[acolidx[j+1]];
643       sum[2] += aval[j+2] * x[acolidx[j+2]];
644       sum[3] += aval[j+3] * x[acolidx[j+3]];
645       sum[4] += aval[j+4] * x[acolidx[j+4]];
646       sum[5] += aval[j+5] * x[acolidx[j+5]];
647       sum[6] += aval[j+6] * x[acolidx[j+6]];
648       sum[7] += aval[j+7] * x[acolidx[j+7]];
649     }
650     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
651       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
652     } else {
653       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
654     }
655   }
656 #endif
657 
658   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
659   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
660   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
661   PetscFunctionReturn(0);
662 }
663 
664 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
665 {
666   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
667   PetscScalar       *y;
668   const PetscScalar *x;
669   const MatScalar   *aval=a->val;
670   const PetscInt    *acolidx=a->colidx;
671   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
672   PetscErrorCode    ierr;
673 
674 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
675 #pragma disjoint(*x,*y,*aval)
676 #endif
677 
678   PetscFunctionBegin;
679   if (A->symmetric) {
680     ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr);
681     PetscFunctionReturn(0);
682   }
683   if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); }
684   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
685   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
686   for (i=0; i<a->totalslices; i++) { /* loop over slices */
687     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
688       for (r=0; r<(A->rmap->n & 0x07); ++r) {
689         row        = 8*i + r;
690         nnz_in_row = a->rlen[row];
691         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
692       }
693       break;
694     }
695     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
696       y[acolidx[j]]   += aval[j] * x[8*i];
697       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
698       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
699       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
700       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
701       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
702       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
703       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
704     }
705   }
706   ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr);
707   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
708   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
709   PetscFunctionReturn(0);
710 }
711 
712 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
713 {
714   PetscErrorCode ierr;
715 
716   PetscFunctionBegin;
717   if (A->symmetric) {
718     ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr);
719   } else {
720     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
721     ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr);
722   }
723   PetscFunctionReturn(0);
724 }
725 
726 /*
727      Checks for missing diagonals
728 */
729 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
730 {
731   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
732   PetscInt    *diag,i;
733 
734   PetscFunctionBegin;
735   *missing = PETSC_FALSE;
736   if (A->rmap->n > 0 && !(a->colidx)) {
737     *missing = PETSC_TRUE;
738     if (d) *d = 0;
739     PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");
740   } else {
741     diag = a->diag;
742     for (i=0; i<A->rmap->n; i++) {
743       if (diag[i] == -1) {
744         *missing = PETSC_TRUE;
745         if (d) *d = i;
746         PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);
747         break;
748       }
749     }
750   }
751   PetscFunctionReturn(0);
752 }
753 
754 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
755 {
756   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
757   PetscInt       i,j,m=A->rmap->n,shift;
758   PetscErrorCode ierr;
759 
760   PetscFunctionBegin;
761   if (!a->diag) {
762     ierr         = PetscMalloc1(m,&a->diag);CHKERRQ(ierr);
763     ierr         = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr);
764     a->free_diag = PETSC_TRUE;
765   }
766   for (i=0; i<m; i++) { /* loop over rows */
767     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
768     a->diag[i] = -1;
769     for (j=0; j<a->rlen[i]; j++) {
770       if (a->colidx[shift+j*8] == i) {
771         a->diag[i] = shift+j*8;
772         break;
773       }
774     }
775   }
776   PetscFunctionReturn(0);
777 }
778 
779 /*
780   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
781 */
782 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
783 {
784   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
785   PetscInt       i,*diag,m = A->rmap->n;
786   MatScalar      *val = a->val;
787   PetscScalar    *idiag,*mdiag;
788   PetscErrorCode ierr;
789 
790   PetscFunctionBegin;
791   if (a->idiagvalid) PetscFunctionReturn(0);
792   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
793   diag = a->diag;
794   if (!a->idiag) {
795     ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr);
796     ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr);
797     val  = a->val;
798   }
799   mdiag = a->mdiag;
800   idiag = a->idiag;
801 
802   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
803     for (i=0; i<m; i++) {
804       mdiag[i] = val[diag[i]];
805       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
806         if (PetscRealPart(fshift)) {
807           ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr);
808           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
809           A->factorerror_zeropivot_value = 0.0;
810           A->factorerror_zeropivot_row   = i;
811         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i);
812       }
813       idiag[i] = 1.0/val[diag[i]];
814     }
815     ierr = PetscLogFlops(m);CHKERRQ(ierr);
816   } else {
817     for (i=0; i<m; i++) {
818       mdiag[i] = val[diag[i]];
819       idiag[i] = omega/(fshift + val[diag[i]]);
820     }
821     ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
822   }
823   a->idiagvalid = PETSC_TRUE;
824   PetscFunctionReturn(0);
825 }
826 
827 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
828 {
829   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
830   PetscErrorCode ierr;
831 
832   PetscFunctionBegin;
833   ierr = PetscMemzero(a->val,(a->sliidx[a->totalslices])*sizeof(PetscScalar));CHKERRQ(ierr);
834   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
835   PetscFunctionReturn(0);
836 }
837 
838 PetscErrorCode MatDestroy_SeqSELL(Mat A)
839 {
840   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
841   PetscErrorCode ierr;
842 
843   PetscFunctionBegin;
844 #if defined(PETSC_USE_LOG)
845   PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz);
846 #endif
847   ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr);
848   ierr = ISDestroy(&a->row);CHKERRQ(ierr);
849   ierr = ISDestroy(&a->col);CHKERRQ(ierr);
850   ierr = PetscFree(a->diag);CHKERRQ(ierr);
851   ierr = PetscFree(a->rlen);CHKERRQ(ierr);
852   ierr = PetscFree(a->sliidx);CHKERRQ(ierr);
853   ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr);
854   ierr = PetscFree(a->solve_work);CHKERRQ(ierr);
855   ierr = ISDestroy(&a->icol);CHKERRQ(ierr);
856   ierr = PetscFree(a->saved_values);CHKERRQ(ierr);
857   ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr);
858 
859   ierr = PetscFree(A->data);CHKERRQ(ierr);
860 
861   ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr);
862   ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr);
863   ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr);
864 #if defined(PETSC_HAVE_ELEMENTAL)
865 #endif
866   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr);
867   PetscFunctionReturn(0);
868 }
869 
870 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
871 {
872   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
873   PetscErrorCode ierr;
874 
875   PetscFunctionBegin;
876   switch (op) {
877   case MAT_ROW_ORIENTED:
878     a->roworiented = flg;
879     break;
880   case MAT_KEEP_NONZERO_PATTERN:
881     a->keepnonzeropattern = flg;
882     break;
883   case MAT_NEW_NONZERO_LOCATIONS:
884     a->nonew = (flg ? 0 : 1);
885     break;
886   case MAT_NEW_NONZERO_LOCATION_ERR:
887     a->nonew = (flg ? -1 : 0);
888     break;
889   case MAT_NEW_NONZERO_ALLOCATION_ERR:
890     a->nonew = (flg ? -2 : 0);
891     break;
892   case MAT_UNUSED_NONZERO_LOCATION_ERR:
893     a->nounused = (flg ? -1 : 0);
894     break;
895   case MAT_NEW_DIAGONALS:
896   case MAT_IGNORE_OFF_PROC_ENTRIES:
897   case MAT_USE_HASH_TABLE:
898     ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr);
899     break;
900   case MAT_SPD:
901   case MAT_SYMMETRIC:
902   case MAT_STRUCTURALLY_SYMMETRIC:
903   case MAT_HERMITIAN:
904   case MAT_SYMMETRY_ETERNAL:
905     /* These options are handled directly by MatSetOption() */
906     break;
907   default:
908     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
909   }
910   PetscFunctionReturn(0);
911 }
912 
913 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
914 {
915   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
916   PetscInt       i,j,n,shift;
917   PetscScalar    *x,zero=0.0;
918   PetscErrorCode ierr;
919 
920   PetscFunctionBegin;
921   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
922   if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
923 
924   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
925     PetscInt *diag=a->diag;
926     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
927     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
928     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
929     PetscFunctionReturn(0);
930   }
931 
932   ierr = VecSet(v,zero);CHKERRQ(ierr);
933   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
934   for (i=0; i<n; i++) { /* loop over rows */
935     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
936     x[i] = 0;
937     for (j=0; j<a->rlen[i]; j++) {
938       if (a->colidx[shift+j*8] == i) {
939         x[i] = a->val[shift+j*8];
940         break;
941       }
942     }
943   }
944   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
945   PetscFunctionReturn(0);
946 }
947 
948 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
949 {
950   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
951   const PetscScalar *l,*r;
952   PetscInt          i,j,m,n,row;
953   PetscErrorCode    ierr;
954 
955   PetscFunctionBegin;
956   if (ll) {
957     /* The local size is used so that VecMPI can be passed to this routine
958        by MatDiagonalScale_MPISELL */
959     ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr);
960     if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
961     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
962     for (i=0; i<a->totalslices; i++) { /* loop over slices */
963       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
964         a->val[j] *= l[8*i+row];
965       }
966     }
967     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
968     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
969   }
970   if (rr) {
971     ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr);
972     if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
973     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
974     for (i=0; i<a->totalslices; i++) { /* loop over slices */
975       for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
976         a->val[j] *= r[a->colidx[j]];
977       }
978     }
979     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
980     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
981   }
982   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
983   PetscFunctionReturn(0);
984 }
985 
986 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode);
987 
988 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
989 {
990   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
991   PetscInt    *cp,i,k,low,high,t,row,col,l;
992   PetscInt    shift;
993   MatScalar   *vp;
994 
995   PetscFunctionBegin;
996   for (k=0; k<m; k++) { /* loop over requested rows */
997     row = im[k];
998     if (row<0) continue;
999 #if defined(PETSC_USE_DEBUG)
1000     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1001 #endif
1002     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1003     cp = a->colidx+shift; /* pointer to the row */
1004     vp = a->val+shift; /* pointer to the row */
1005     for (l=0; l<n; l++) { /* loop over requested columns */
1006       col = in[l];
1007       if (col<0) continue;
1008 #if defined(PETSC_USE_DEBUG)
1009       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1);
1010 #endif
1011       high = a->rlen[row]; low = 0; /* assume unsorted */
1012       while (high-low > 5) {
1013         t = (low+high)/2;
1014         if (*(cp+t*8) > col) high = t;
1015         else low = t;
1016       }
1017       for (i=low; i<high; i++) {
1018         if (*(cp+8*i) > col) break;
1019         if (*(cp+8*i) == col) {
1020           *v++ = *(vp+8*i);
1021           goto finished;
1022         }
1023       }
1024       *v++ = 0.0;
1025     finished:;
1026     }
1027   }
1028   PetscFunctionReturn(0);
1029 }
1030 
1031 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1032 {
1033   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1034   PetscInt          i,j,m=A->rmap->n,shift;
1035   const char        *name;
1036   PetscViewerFormat format;
1037   PetscErrorCode    ierr;
1038 
1039   PetscFunctionBegin;
1040   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1041   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1042     PetscInt nofinalvalue = 0;
1043     /*
1044     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1045       nofinalvalue = 1;
1046     }
1047     */
1048     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1049     ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr);
1050     ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr);
1051 #if defined(PETSC_USE_COMPLEX)
1052     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1053 #else
1054     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1055 #endif
1056     ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr);
1057 
1058     for (i=0; i<m; i++) {
1059       shift = a->sliidx[i>>3]+(i&0x07);
1060       for (j=0; j<a->rlen[i]; j++) {
1061 #if defined(PETSC_USE_COMPLEX)
1062         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1063 #else
1064         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1065 #endif
1066       }
1067     }
1068     /*
1069     if (nofinalvalue) {
1070 #if defined(PETSC_USE_COMPLEX)
1071       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr);
1072 #else
1073       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr);
1074 #endif
1075     }
1076     */
1077     ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr);
1078     ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr);
1079     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1080   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1081     PetscFunctionReturn(0);
1082   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1083     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1084     for (i=0; i<m; i++) {
1085       ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1086       shift = a->sliidx[i>>3]+(i&0x07);
1087       for (j=0; j<a->rlen[i]; j++) {
1088 #if defined(PETSC_USE_COMPLEX)
1089         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1090           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1091         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1092           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1093         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1094           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1095         }
1096 #else
1097         if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);}
1098 #endif
1099       }
1100       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1101     }
1102     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1103   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1104     PetscInt    cnt=0,jcnt;
1105     PetscScalar value;
1106 #if defined(PETSC_USE_COMPLEX)
1107     PetscBool   realonly=PETSC_TRUE;
1108     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1109       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1110         realonly = PETSC_FALSE;
1111         break;
1112       }
1113     }
1114 #endif
1115 
1116     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1117     for (i=0; i<m; i++) {
1118       jcnt = 0;
1119       shift = a->sliidx[i>>3]+(i&0x07);
1120       for (j=0; j<A->cmap->n; j++) {
1121         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1122           value = a->val[cnt++];
1123           jcnt++;
1124         } else {
1125           value = 0.0;
1126         }
1127 #if defined(PETSC_USE_COMPLEX)
1128         if (realonly) {
1129           ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr);
1130         } else {
1131           ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr);
1132         }
1133 #else
1134         ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr);
1135 #endif
1136       }
1137       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1138     }
1139     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1140   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1141     PetscInt fshift=1;
1142     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1143 #if defined(PETSC_USE_COMPLEX)
1144     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr);
1145 #else
1146     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr);
1147 #endif
1148     ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr);
1149     for (i=0; i<m; i++) {
1150       shift = a->sliidx[i>>3]+(i&0x07);
1151       for (j=0; j<a->rlen[i]; j++) {
1152 #if defined(PETSC_USE_COMPLEX)
1153         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1154 #else
1155         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1156 #endif
1157       }
1158     }
1159     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1160   } else if (format == PETSC_VIEWER_NATIVE) {
1161     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1162       PetscInt row;
1163       ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr);
1164       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1165 #if defined(PETSC_USE_COMPLEX)
1166         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1167           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1168         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1169           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1170         } else {
1171           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1172         }
1173 #else
1174         ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1175 #endif
1176       }
1177     }
1178   } else {
1179     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1180     if (A->factortype) {
1181       for (i=0; i<m; i++) {
1182         shift = a->sliidx[i>>3]+(i&0x07);
1183         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1184         /* L part */
1185         for (j=shift; j<a->diag[i]; j+=8) {
1186 #if defined(PETSC_USE_COMPLEX)
1187           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1188             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1189           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1190             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1191           } else {
1192             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1193           }
1194 #else
1195           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1196 #endif
1197         }
1198         /* diagonal */
1199         j = a->diag[i];
1200 #if defined(PETSC_USE_COMPLEX)
1201         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1202           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr);
1203         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1204           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr);
1205         } else {
1206           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr);
1207         }
1208 #else
1209         ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr);
1210 #endif
1211 
1212         /* U part */
1213         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1214 #if defined(PETSC_USE_COMPLEX)
1215           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1216             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1217           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1218             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1219           } else {
1220             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1221           }
1222 #else
1223           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1224 #endif
1225         }
1226         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1227       }
1228     } else {
1229       for (i=0; i<m; i++) {
1230         shift = a->sliidx[i>>3]+(i&0x07);
1231         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1232         for (j=0; j<a->rlen[i]; j++) {
1233 #if defined(PETSC_USE_COMPLEX)
1234           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1235             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1236           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1237             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1238           } else {
1239             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1240           }
1241 #else
1242           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);
1243 #endif
1244         }
1245         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1246       }
1247     }
1248     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1249   }
1250   ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
1251   PetscFunctionReturn(0);
1252 }
1253 
1254 #include <petscdraw.h>
1255 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1256 {
1257   Mat               A=(Mat)Aa;
1258   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1259   PetscInt          i,j,m=A->rmap->n,shift;
1260   int               color;
1261   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1262   PetscViewer       viewer;
1263   PetscViewerFormat format;
1264   PetscErrorCode    ierr;
1265 
1266   PetscFunctionBegin;
1267   ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr);
1268   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1269   ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr);
1270 
1271   /* loop over matrix elements drawing boxes */
1272 
1273   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1274     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1275     /* Blue for negative, Cyan for zero and  Red for positive */
1276     color = PETSC_DRAW_BLUE;
1277     for (i=0; i<m; i++) {
1278       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1279       y_l = m - i - 1.0; y_r = y_l + 1.0;
1280       for (j=0; j<a->rlen[i]; j++) {
1281         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1282         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1283         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1284       }
1285     }
1286     color = PETSC_DRAW_CYAN;
1287     for (i=0; i<m; i++) {
1288       shift = a->sliidx[i>>3]+(i&0x07);
1289       y_l = m - i - 1.0; y_r = y_l + 1.0;
1290       for (j=0; j<a->rlen[i]; j++) {
1291         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1292         if (a->val[shift+8*j] !=  0.) continue;
1293         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1294       }
1295     }
1296     color = PETSC_DRAW_RED;
1297     for (i=0; i<m; i++) {
1298       shift = a->sliidx[i>>3]+(i&0x07);
1299       y_l = m - i - 1.0; y_r = y_l + 1.0;
1300       for (j=0; j<a->rlen[i]; j++) {
1301         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1302         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1303         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1304       }
1305     }
1306     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1307   } else {
1308     /* use contour shading to indicate magnitude of values */
1309     /* first determine max of all nonzero values */
1310     PetscReal minv=0.0,maxv=0.0;
1311     PetscInt  count=0;
1312     PetscDraw popup;
1313     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1314       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1315     }
1316     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1317     ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr);
1318     ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr);
1319 
1320     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1321     for (i=0; i<m; i++) {
1322       shift = a->sliidx[i>>3]+(i&0x07);
1323       y_l = m - i - 1.0;
1324       y_r = y_l + 1.0;
1325       for (j=0; j<a->rlen[i]; j++) {
1326         x_l = a->colidx[shift+j*8];
1327         x_r = x_l + 1.0;
1328         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1329         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1330         count++;
1331       }
1332     }
1333     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1334   }
1335   PetscFunctionReturn(0);
1336 }
1337 
1338 #include <petscdraw.h>
1339 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1340 {
1341   PetscDraw      draw;
1342   PetscReal      xr,yr,xl,yl,h,w;
1343   PetscBool      isnull;
1344   PetscErrorCode ierr;
1345 
1346   PetscFunctionBegin;
1347   ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
1348   ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr);
1349   if (isnull) PetscFunctionReturn(0);
1350 
1351   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1352   xr  += w;          yr += h;         xl = -w;     yl = -h;
1353   ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr);
1354   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr);
1355   ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr);
1356   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr);
1357   ierr = PetscDrawSave(draw);CHKERRQ(ierr);
1358   PetscFunctionReturn(0);
1359 }
1360 
1361 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1362 {
1363   PetscBool      iascii,isbinary,isdraw;
1364   PetscErrorCode ierr;
1365 
1366   PetscFunctionBegin;
1367   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
1368   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr);
1369   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr);
1370   if (iascii) {
1371     ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr);
1372   } else if (isbinary) {
1373     /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */
1374   } else if (isdraw) {
1375     ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr);
1376   }
1377   PetscFunctionReturn(0);
1378 }
1379 
1380 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1381 {
1382   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1383   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1384   MatScalar      *vp;
1385   PetscErrorCode ierr;
1386 
1387   PetscFunctionBegin;
1388   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1389   /* To do: compress out the unused elements */
1390   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
1391   ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr);
1392   ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr);
1393   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr);
1394   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1395   for (i=0; i<a->totalslices; ++i) {
1396     shift = a->sliidx[i];    /* starting index of the slice */
1397     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1398     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1399     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1400       row  = 8*i + row_in_slice;
1401       nrow = a->rlen[row]; /* number of nonzeros in row */
1402       /*
1403         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1404         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1405       */
1406       lastcol = 0;
1407       if (nrow>0) { /* nonempty row */
1408         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1409       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1410         for (j=1;j<8;j++) {
1411           if (a->rlen[8*i+j]) {
1412             lastcol = cp[j];
1413             break;
1414           }
1415         }
1416       } else {
1417         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1418       }
1419 
1420       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1421         cp[8*k+row_in_slice] = lastcol;
1422         vp[8*k+row_in_slice] = (MatScalar)0;
1423       }
1424     }
1425   }
1426 
1427   A->info.mallocs += a->reallocs;
1428   a->reallocs      = 0;
1429 
1430   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1431   PetscFunctionReturn(0);
1432 }
1433 
1434 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1435 {
1436   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1437 
1438   PetscFunctionBegin;
1439   info->block_size   = 1.0;
1440   info->nz_allocated = (double)a->maxallocmat;
1441   info->nz_used      = (double)a->sliidx[a->totalslices]; /* include padding zeros */
1442   info->nz_unneeded  = (double)(a->maxallocmat-a->sliidx[a->totalslices]);
1443   info->assemblies   = (double)A->num_ass;
1444   info->mallocs      = (double)A->info.mallocs;
1445   info->memory       = ((PetscObject)A)->mem;
1446   if (A->factortype) {
1447     info->fill_ratio_given  = A->info.fill_ratio_given;
1448     info->fill_ratio_needed = A->info.fill_ratio_needed;
1449     info->factor_mallocs    = A->info.factor_mallocs;
1450   } else {
1451     info->fill_ratio_given  = 0;
1452     info->fill_ratio_needed = 0;
1453     info->factor_mallocs    = 0;
1454   }
1455   PetscFunctionReturn(0);
1456 }
1457 
1458 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1459 {
1460   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1461   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1462   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1463   MatScalar      *vp,value;
1464   PetscErrorCode ierr;
1465 
1466   PetscFunctionBegin;
1467   for (k=0; k<m; k++) { /* loop over added rows */
1468     row = im[k];
1469     if (row < 0) continue;
1470 #if defined(PETSC_USE_DEBUG)
1471     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1472 #endif
1473     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1474     cp    = a->colidx+shift; /* pointer to the row */
1475     vp    = a->val+shift; /* pointer to the row */
1476     nrow  = a->rlen[row];
1477     low   = 0;
1478     high  = nrow;
1479 
1480     for (l=0; l<n; l++) { /* loop over added columns */
1481       col = in[l];
1482       if (col<0) continue;
1483 #if defined(PETSC_USE_DEBUG)
1484       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1);
1485 #endif
1486       if (a->roworiented) {
1487         value = v[l+k*n];
1488       } else {
1489         value = v[k+l*m];
1490       }
1491       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1492 
1493       /* search in this row for the specified colmun, i indicates the column to be set */
1494       if (col <= lastcol) low = 0;
1495       else high = nrow;
1496       lastcol = col;
1497       while (high-low > 5) {
1498         t = (low+high)/2;
1499         if (*(cp+t*8) > col) high = t;
1500         else low = t;
1501       }
1502       for (i=low; i<high; i++) {
1503         if (*(cp+i*8) > col) break;
1504         if (*(cp+i*8) == col) {
1505           if (is == ADD_VALUES) *(vp+i*8) += value;
1506           else *(vp+i*8) = value;
1507           low = i + 1;
1508           goto noinsert;
1509         }
1510       }
1511       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1512       if (nonew == 1) goto noinsert;
1513       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col);
1514       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1515       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1516       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1517       for (ii=nrow-1; ii>=i; ii--) {
1518         *(cp+(ii+1)*8) = *(cp+ii*8);
1519         *(vp+(ii+1)*8) = *(vp+ii*8);
1520       }
1521       a->rlen[row]++;
1522       *(cp+i*8) = col;
1523       *(vp+i*8) = value;
1524       a->nz++;
1525       A->nonzerostate++;
1526       low = i+1; high++; nrow++;
1527 noinsert:;
1528     }
1529     a->rlen[row] = nrow;
1530   }
1531   PetscFunctionReturn(0);
1532 }
1533 
1534 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1535 {
1536   PetscErrorCode ierr;
1537 
1538   PetscFunctionBegin;
1539   /* If the two matrices have the same copy implementation, use fast copy. */
1540   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1541     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1542     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1543 
1544     if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1545     ierr = PetscMemcpy(b->val,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1546   } else {
1547     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1548   }
1549   PetscFunctionReturn(0);
1550 }
1551 
1552 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1553 {
1554   PetscErrorCode ierr;
1555 
1556   PetscFunctionBegin;
1557   ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr);
1558   PetscFunctionReturn(0);
1559 }
1560 
1561 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1562 {
1563   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1564 
1565   PetscFunctionBegin;
1566   *array = a->val;
1567   PetscFunctionReturn(0);
1568 }
1569 
1570 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1571 {
1572   PetscFunctionBegin;
1573   PetscFunctionReturn(0);
1574 }
1575 
1576 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1577 {
1578   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1579   PetscInt    i;
1580   MatScalar   *aval=a->val;
1581 
1582   PetscFunctionBegin;
1583   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1584   PetscFunctionReturn(0);
1585 }
1586 
1587 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1588 {
1589   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1590   PetscInt       i;
1591   MatScalar      *aval=a->val;
1592   PetscErrorCode ierr;
1593 
1594   PetscFunctionBegin;
1595   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1596   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1597   PetscFunctionReturn(0);
1598 }
1599 
1600 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1601 {
1602   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1603   MatScalar      *aval=a->val;
1604   PetscScalar    oalpha=alpha;
1605   PetscBLASInt   one=1,size;
1606   PetscErrorCode ierr;
1607 
1608   PetscFunctionBegin;
1609   ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr);
1610   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1611   ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1612   ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr);
1613   PetscFunctionReturn(0);
1614 }
1615 
1616 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1617 {
1618   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1619   PetscErrorCode ierr;
1620 
1621   PetscFunctionBegin;
1622   if (!Y->preallocated || !y->nz) {
1623     ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr);
1624   }
1625   ierr = MatShift_Basic(Y,a);CHKERRQ(ierr);
1626   PetscFunctionReturn(0);
1627 }
1628 
1629 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1630 {
1631   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1632   PetscScalar       *x,sum,*t;
1633   const MatScalar   *idiag=0,*mdiag;
1634   const PetscScalar *b,*xb;
1635   PetscInt          n,m=A->rmap->n,i,j,shift;
1636   const PetscInt    *diag;
1637   PetscErrorCode    ierr;
1638 
1639   PetscFunctionBegin;
1640   its = its*lits;
1641 
1642   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1643   if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);}
1644   a->fshift = fshift;
1645   a->omega  = omega;
1646 
1647   diag  = a->diag;
1648   t     = a->ssor_work;
1649   idiag = a->idiag;
1650   mdiag = a->mdiag;
1651 
1652   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1653   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1654   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1655   if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1656   if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1657   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1658 
1659   if (flag & SOR_ZERO_INITIAL_GUESS) {
1660     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1661       for (i=0; i<m; i++) {
1662         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1663         sum   = b[i];
1664         n     = (diag[i]-shift)/8;
1665         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1666         t[i]  = sum;
1667         x[i]  = sum*idiag[i];
1668       }
1669       xb   = t;
1670       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1671     } else xb = b;
1672     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1673       for (i=m-1; i>=0; i--) {
1674         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1675         sum   = xb[i];
1676         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1677         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1678         if (xb == b) {
1679           x[i] = sum*idiag[i];
1680         } else {
1681           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1682         }
1683       }
1684       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1685     }
1686     its--;
1687   }
1688   while (its--) {
1689     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1690       for (i=0; i<m; i++) {
1691         /* lower */
1692         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1693         sum   = b[i];
1694         n     = (diag[i]-shift)/8;
1695         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1696         t[i]  = sum;             /* save application of the lower-triangular part */
1697         /* upper */
1698         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1699         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1700         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1701       }
1702       xb   = t;
1703       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1704     } else xb = b;
1705     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1706       for (i=m-1; i>=0; i--) {
1707         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1708         sum = xb[i];
1709         if (xb == b) {
1710           /* whole matrix (no checkpointing available) */
1711           n     = a->rlen[i];
1712           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1713           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1714         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1715           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1716           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1717           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1718         }
1719       }
1720       if (xb == b) {
1721         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1722       } else {
1723         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1724       }
1725     }
1726   }
1727   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1728   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1729   PetscFunctionReturn(0);
1730 }
1731 
1732 /* -------------------------------------------------------------------*/
1733 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1734                                        MatGetRow_SeqSELL,
1735                                        MatRestoreRow_SeqSELL,
1736                                        MatMult_SeqSELL,
1737                                /* 4*/  MatMultAdd_SeqSELL,
1738                                        MatMultTranspose_SeqSELL,
1739                                        MatMultTransposeAdd_SeqSELL,
1740                                        0,
1741                                        0,
1742                                        0,
1743                                /* 10*/ 0,
1744                                        0,
1745                                        0,
1746                                        MatSOR_SeqSELL,
1747                                        0,
1748                                /* 15*/ MatGetInfo_SeqSELL,
1749                                        MatEqual_SeqSELL,
1750                                        MatGetDiagonal_SeqSELL,
1751                                        MatDiagonalScale_SeqSELL,
1752                                        0,
1753                                /* 20*/ 0,
1754                                        MatAssemblyEnd_SeqSELL,
1755                                        MatSetOption_SeqSELL,
1756                                        MatZeroEntries_SeqSELL,
1757                                /* 24*/ 0,
1758                                        0,
1759                                        0,
1760                                        0,
1761                                        0,
1762                                /* 29*/ MatSetUp_SeqSELL,
1763                                        0,
1764                                        0,
1765                                        0,
1766                                        0,
1767                                /* 34*/ MatDuplicate_SeqSELL,
1768                                        0,
1769                                        0,
1770                                        0,
1771                                        0,
1772                                /* 39*/ 0,
1773                                        0,
1774                                        0,
1775                                        MatGetValues_SeqSELL,
1776                                        MatCopy_SeqSELL,
1777                                /* 44*/ 0,
1778                                        MatScale_SeqSELL,
1779                                        MatShift_SeqSELL,
1780                                        0,
1781                                        0,
1782                                /* 49*/ 0,
1783                                        0,
1784                                        0,
1785                                        0,
1786                                        0,
1787                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1788                                        0,
1789                                        0,
1790                                        0,
1791                                        0,
1792                                /* 59*/ 0,
1793                                        MatDestroy_SeqSELL,
1794                                        MatView_SeqSELL,
1795                                        0,
1796                                        0,
1797                                /* 64*/ 0,
1798                                        0,
1799                                        0,
1800                                        0,
1801                                        0,
1802                                /* 69*/ 0,
1803                                        0,
1804                                        0,
1805                                        0,
1806                                        0,
1807                                /* 74*/ 0,
1808                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1809                                        0,
1810                                        0,
1811                                        0,
1812                                /* 79*/ 0,
1813                                        0,
1814                                        0,
1815                                        0,
1816                                        0,
1817                                /* 84*/ 0,
1818                                        0,
1819                                        0,
1820                                        0,
1821                                        0,
1822                                /* 89*/ 0,
1823                                        0,
1824                                        0,
1825                                        0,
1826                                        0,
1827                                /* 94*/ 0,
1828                                        0,
1829                                        0,
1830                                        0,
1831                                        0,
1832                                /* 99*/ 0,
1833                                        0,
1834                                        0,
1835                                        MatConjugate_SeqSELL,
1836                                        0,
1837                                /*104*/ 0,
1838                                        0,
1839                                        0,
1840                                        0,
1841                                        0,
1842                                /*109*/ 0,
1843                                        0,
1844                                        0,
1845                                        0,
1846                                        MatMissingDiagonal_SeqSELL,
1847                                /*114*/ 0,
1848                                        0,
1849                                        0,
1850                                        0,
1851                                        0,
1852                                /*119*/ 0,
1853                                        0,
1854                                        0,
1855                                        0,
1856                                        0,
1857                                /*124*/ 0,
1858                                        0,
1859                                        0,
1860                                        0,
1861                                        0,
1862                                /*129*/ 0,
1863                                        0,
1864                                        0,
1865                                        0,
1866                                        0,
1867                                /*134*/ 0,
1868                                        0,
1869                                        0,
1870                                        0,
1871                                        0,
1872                                /*139*/ 0,
1873                                        0,
1874                                        0,
1875                                        MatFDColoringSetUp_SeqXAIJ,
1876                                        0,
1877                                 /*144*/0
1878 };
1879 
1880 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1881 {
1882   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1883   PetscErrorCode ierr;
1884 
1885   PetscFunctionBegin;
1886   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1887 
1888   /* allocate space for values if not already there */
1889   if (!a->saved_values) {
1890     ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr);
1891     ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1892   }
1893 
1894   /* copy values over */
1895   ierr = PetscMemcpy(a->saved_values,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1896   PetscFunctionReturn(0);
1897 }
1898 
1899 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1900 {
1901   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1902   PetscErrorCode ierr;
1903 
1904   PetscFunctionBegin;
1905   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1906   if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1907   /* copy values over */
1908   ierr = PetscMemcpy(a->val,a->saved_values,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1909   PetscFunctionReturn(0);
1910 }
1911 
1912 /*@C
1913  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1914 
1915  Not Collective
1916 
1917  Input Parameters:
1918  .  mat - a MATSEQSELL matrix
1919  .  array - pointer to the data
1920 
1921  Level: intermediate
1922 
1923  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1924  @*/
1925 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1926 {
1927   PetscErrorCode ierr;
1928 
1929   PetscFunctionBegin;
1930   ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr);
1931   PetscFunctionReturn(0);
1932 }
1933 
1934 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1935 {
1936   Mat_SeqSELL    *b;
1937   PetscMPIInt    size;
1938   PetscErrorCode ierr;
1939 
1940   PetscFunctionBegin;
1941   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
1942   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1943 
1944   ierr = PetscNewLog(B,&b);CHKERRQ(ierr);
1945 
1946   B->data = (void*)b;
1947 
1948   ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1949 
1950   b->row                = 0;
1951   b->col                = 0;
1952   b->icol               = 0;
1953   b->reallocs           = 0;
1954   b->ignorezeroentries  = PETSC_FALSE;
1955   b->roworiented        = PETSC_TRUE;
1956   b->nonew              = 0;
1957   b->diag               = 0;
1958   b->solve_work         = 0;
1959   B->spptr              = 0;
1960   b->saved_values       = 0;
1961   b->idiag              = 0;
1962   b->mdiag              = 0;
1963   b->ssor_work          = 0;
1964   b->omega              = 1.0;
1965   b->fshift             = 0.0;
1966   b->idiagvalid         = PETSC_FALSE;
1967   b->keepnonzeropattern = PETSC_FALSE;
1968 
1969   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr);
1970   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr);
1971   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr);
1972   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr);
1973   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr);
1974   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr);
1975   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr);
1976   PetscFunctionReturn(0);
1977 }
1978 
1979 /*
1980  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1981  */
1982 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1983 {
1984   Mat_SeqSELL    *c,*a=(Mat_SeqSELL*)A->data;
1985   PetscInt       i,m=A->rmap->n;
1986   PetscInt       totalslices=a->totalslices;
1987   PetscErrorCode ierr;
1988 
1989   PetscFunctionBegin;
1990   c = (Mat_SeqSELL*)C->data;
1991 
1992   C->factortype = A->factortype;
1993   c->row        = 0;
1994   c->col        = 0;
1995   c->icol       = 0;
1996   c->reallocs   = 0;
1997 
1998   C->assembled = PETSC_TRUE;
1999 
2000   ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr);
2001   ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr);
2002 
2003   ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr);
2004   ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2005   ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr);
2006   ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
2007 
2008   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
2009   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
2010 
2011   /* allocate the matrix space */
2012   if (mallocmatspace) {
2013     ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr);
2014     ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
2015 
2016     c->singlemalloc = PETSC_TRUE;
2017 
2018     if (m > 0) {
2019       ierr = PetscMemcpy(c->colidx,a->colidx,(a->maxallocmat)*sizeof(PetscInt));CHKERRQ(ierr);
2020       if (cpvalues == MAT_COPY_VALUES) {
2021         ierr = PetscMemcpy(c->val,a->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2022       } else {
2023         ierr = PetscMemzero(c->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2024       }
2025     }
2026   }
2027 
2028   c->ignorezeroentries = a->ignorezeroentries;
2029   c->roworiented       = a->roworiented;
2030   c->nonew             = a->nonew;
2031   if (a->diag) {
2032     ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr);
2033     ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2034     for (i=0; i<m; i++) {
2035       c->diag[i] = a->diag[i];
2036     }
2037   } else c->diag = 0;
2038 
2039   c->solve_work         = 0;
2040   c->saved_values       = 0;
2041   c->idiag              = 0;
2042   c->ssor_work          = 0;
2043   c->keepnonzeropattern = a->keepnonzeropattern;
2044   c->free_val           = PETSC_TRUE;
2045   c->free_colidx        = PETSC_TRUE;
2046 
2047   c->maxallocmat  = a->maxallocmat;
2048   c->maxallocrow  = a->maxallocrow;
2049   c->rlenmax      = a->rlenmax;
2050   c->nz           = a->nz;
2051   C->preallocated = PETSC_TRUE;
2052 
2053   c->nonzerorowcnt = a->nonzerorowcnt;
2054   C->nonzerostate  = A->nonzerostate;
2055 
2056   ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr);
2057   PetscFunctionReturn(0);
2058 }
2059 
2060 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2061 {
2062   PetscErrorCode ierr;
2063 
2064   PetscFunctionBegin;
2065   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
2066   ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
2067   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2068     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
2069   }
2070   ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
2071   ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr);
2072   PetscFunctionReturn(0);
2073 }
2074 
2075 /*@C
2076  MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2077 
2078  Collective on MPI_Comm
2079 
2080  Input Parameters:
2081  +  comm - MPI communicator, set to PETSC_COMM_SELF
2082  .  m - number of rows
2083  .  n - number of columns
2084  .  rlenmax - maximum number of nonzeros in a row
2085  -  rlen - array containing the number of nonzeros in the various rows
2086  (possibly different for each row) or NULL
2087 
2088  Output Parameter:
2089  .  A - the matrix
2090 
2091  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2092  MatXXXXSetPreallocation() paradgm instead of this routine directly.
2093  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2094 
2095  Notes:
2096  If nnz is given then nz is ignored
2097 
2098  Specify the preallocated storage with either rlenmax or rlen (not both).
2099  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2100  allocation.  For large problems you MUST preallocate memory or you
2101  will get TERRIBLE performance, see the users' manual chapter on matrices.
2102 
2103  Level: intermediate
2104 
2105  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatCreateSeqSELLWithArrays()
2106 
2107  @*/
2108 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2109 {
2110   PetscErrorCode ierr;
2111 
2112   PetscFunctionBegin;
2113   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2114   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2115   ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr);
2116   ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr);
2117   PetscFunctionReturn(0);
2118 }
2119 
2120 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2121 {
2122   Mat_SeqSELL     *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2123   PetscInt       totalslices=a->totalslices;
2124   PetscErrorCode ierr;
2125 
2126   PetscFunctionBegin;
2127   /* If the  matrix dimensions are not equal,or no of nonzeros */
2128   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2129     *flg = PETSC_FALSE;
2130     PetscFunctionReturn(0);
2131   }
2132   /* if the a->colidx are the same */
2133   ierr = PetscMemcmp(a->colidx,b->colidx,a->sliidx[totalslices]*sizeof(PetscInt),flg);CHKERRQ(ierr);
2134   if (!*flg) PetscFunctionReturn(0);
2135   /* if a->val are the same */
2136   ierr = PetscMemcmp(a->val,b->val,a->sliidx[totalslices]*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2137   PetscFunctionReturn(0);
2138 }
2139 
2140 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2141 {
2142   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2143 
2144   PetscFunctionBegin;
2145   a->idiagvalid  = PETSC_FALSE;
2146   PetscFunctionReturn(0);
2147 }
2148 
2149 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2150 {
2151 #if defined(PETSC_USE_COMPLEX)
2152   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2153   PetscInt    i;
2154   PetscScalar *val = a->val;
2155 
2156   PetscFunctionBegin;
2157   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2158     val[i] = PetscConj(val[i]);
2159   }
2160 #else
2161   PetscFunctionBegin;
2162 #endif
2163   PetscFunctionReturn(0);
2164 }
2165