xref: /petsc/src/mat/impls/sell/seq/sell.c (revision 9fc323658beecb572646cde2a15ce26a38213e75)
1 
2 /*
3   Defines the basic matrix operations for the SELL matrix storage format.
4 */
5 #include <../src/mat/impls/sell/seq/sell.h>  /*I   "petscmat.h"  I*/
6 #include <petscblaslapack.h>
7 #include <petsc/private/kernels/blocktranspose.h>
8 #if defined(PETSC_HAVE_IMMINTRIN_H)
9   #include <immintrin.h>
10 
11   #if !defined(_MM_SCALE_8)
12   #define _MM_SCALE_8    8
13   #endif
14 
15   #if defined(__AVX512F__)
16   /* these do not work
17    vec_idx  = _mm512_loadunpackhi_epi32(vec_idx,acolidx);
18    vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval);
19   */
20     #define AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
21     /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \
22     vec_idx  = _mm256_load_si256((__m256i const*)acolidx); \
23     vec_vals = _mm512_load_pd(aval); \
24     vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); \
25     vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y)
26   #elif defined(__AVX2__)
27     #define AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y) \
28     vec_vals = _mm256_load_pd(aval); \
29     vec_idx  =  _mm_load_si128((__m128i const*)acolidx); /* SSE2 */ \
30     vec_x    = _mm256_i32gather_pd(x,vec_idx,_MM_SCALE_8); \
31     vec_y    = _mm256_fmadd_pd(vec_x,vec_vals,vec_y)
32   #endif
33 #endif  /* PETSC_HAVE_IMMINTRIN_H */
34 
35 /*@C
36  MatSeqSELLSetPreallocation - For good matrix assembly performance
37  the user should preallocate the matrix storage by setting the parameter nz
38  (or the array nnz).  By setting these parameters accurately, performance
39  during matrix assembly can be increased significantly.
40 
41  Collective on MPI_Comm
42 
43  Input Parameters:
44  +  B - The matrix
45  .  nz - number of nonzeros per row (same for all rows)
46  -  nnz - array containing the number of nonzeros in the various rows
47  (possibly different for each row) or NULL
48 
49  Notes:
50  If nnz is given then nz is ignored.
51 
52  Specify the preallocated storage with either nz or nnz (not both).
53  Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
54  allocation.  For large problems you MUST preallocate memory or you
55  will get TERRIBLE performance, see the users' manual chapter on matrices.
56 
57  You can call MatGetInfo() to get information on how effective the preallocation was;
58  for example the fields mallocs,nz_allocated,nz_used,nz_unneeded;
59  You can also run with the option -info and look for messages with the string
60  malloc in them to see if additional memory allocation was needed.
61 
62  Developers: Use nz of MAT_SKIP_ALLOCATION to not allocate any space for the matrix
63  entries or columns indices.
64 
65  The maximum number of nonzeos in any row should be as accuate as possible.
66  If it is underesitmated, you will get bad performance due to reallocation
67  (MatSeqXSELLReallocateSELL).
68 
69  Level: intermediate
70 
71  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatGetInfo()
72 
73  @*/
74 PetscErrorCode MatSeqSELLSetPreallocation(Mat B,PetscInt rlenmax,const PetscInt rlen[])
75 {
76   PetscErrorCode ierr;
77 
78   PetscFunctionBegin;
79   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
80   PetscValidType(B,1);
81   ierr = PetscTryMethod(B,"MatSeqSELLSetPreallocation_C",(Mat,PetscInt,const PetscInt[]),(B,rlenmax,rlen));CHKERRQ(ierr);
82   PetscFunctionReturn(0);
83 }
84 
85 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B,PetscInt maxallocrow,const PetscInt rlen[])
86 {
87   Mat_SeqSELL    *b;
88   PetscInt       i,j,totalslices;
89   PetscBool      skipallocation=PETSC_FALSE,realalloc=PETSC_FALSE;
90   PetscErrorCode ierr;
91 
92   PetscFunctionBegin;
93   if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE;
94   if (maxallocrow == MAT_SKIP_ALLOCATION) {
95     skipallocation = PETSC_TRUE;
96     maxallocrow    = 0;
97   }
98 
99   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
100   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
101 
102   /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */
103   if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5;
104   if (maxallocrow < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"maxallocrow cannot be less than 0: value %D",maxallocrow);
105   if (rlen) {
106     for (i=0; i<B->rmap->n; i++) {
107       if (rlen[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be less than 0: local row %D value %D",i,rlen[i]);
108       if (rlen[i] > B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"rlen cannot be greater than row length: local row %D value %D rowlength %D",i,rlen[i],B->cmap->n);
109     }
110   }
111 
112   B->preallocated = PETSC_TRUE;
113 
114   b = (Mat_SeqSELL*)B->data;
115 
116   totalslices = B->rmap->n/8+((B->rmap->n & 0x07)?1:0); /* ceil(n/8) */
117   b->totalslices = totalslices;
118   if (!skipallocation) {
119     if (B->rmap->n & 0x07) PetscInfo1(B,"Padding rows to the SEQSELL matrix because the number of rows is not the multiple of 8 (value %D)\n",B->rmap->n);
120 
121     if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */
122       ierr = PetscMalloc1(totalslices+1,&b->sliidx);CHKERRQ(ierr);
123       ierr = PetscLogObjectMemory((PetscObject)B,(totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
124     }
125     if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */
126       if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10;
127       else if (maxallocrow < 0) maxallocrow = 1;
128       for (i=0; i<=totalslices; i++) b->sliidx[i] = i*8*maxallocrow;
129     } else {
130       maxallocrow = 0;
131       b->sliidx[0] = 0;
132       for (i=1; i<totalslices; i++) {
133         b->sliidx[i] = 0;
134         for (j=0;j<8;j++) {
135           b->sliidx[i] = PetscMax(b->sliidx[i],rlen[8*(i-1)+j]);
136         }
137         maxallocrow = PetscMax(b->sliidx[i],maxallocrow);
138         b->sliidx[i] = b->sliidx[i-1] + 8*b->sliidx[i];
139       }
140       /* last slice */
141       b->sliidx[totalslices] = 0;
142       for (j=(totalslices-1)*8;j<B->rmap->n;j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices],rlen[j]);
143       maxallocrow = PetscMax(b->sliidx[totalslices],maxallocrow);
144       b->sliidx[totalslices] = b->sliidx[totalslices-1] + 8*b->sliidx[totalslices];
145     }
146 
147     /* allocate space for val, colidx, rlen */
148     /* FIXME: should B's old memory be unlogged? */
149     ierr = MatSeqXSELLFreeSELL(B,&b->val,&b->colidx);CHKERRQ(ierr);
150     /* FIXME: assuming an element of the bit array takes 8 bits */
151     ierr = PetscMalloc2(b->sliidx[totalslices],&b->val,b->sliidx[totalslices],&b->colidx);CHKERRQ(ierr);
152     ierr = PetscLogObjectMemory((PetscObject)B,b->sliidx[totalslices]*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
153     /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */
154     ierr = PetscCalloc1(8*totalslices,&b->rlen);CHKERRQ(ierr);
155     ierr = PetscLogObjectMemory((PetscObject)B,8*totalslices*sizeof(PetscInt));CHKERRQ(ierr);
156 
157     b->singlemalloc = PETSC_TRUE;
158     b->free_val     = PETSC_TRUE;
159     b->free_colidx  = PETSC_TRUE;
160   } else {
161     b->free_val    = PETSC_FALSE;
162     b->free_colidx = PETSC_FALSE;
163   }
164 
165   b->nz               = 0;
166   b->maxallocrow      = maxallocrow;
167   b->rlenmax          = maxallocrow;
168   b->maxallocmat      = b->sliidx[totalslices];
169   B->info.nz_unneeded = (double)b->maxallocmat;
170   if (realalloc) {
171     ierr = MatSetOption(B,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
172   }
173   PetscFunctionReturn(0);
174 }
175 
176 PetscErrorCode MatGetRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
177 {
178   Mat_SeqSELL *a = (Mat_SeqSELL*)A->data;
179   PetscInt    shift;
180 
181   PetscFunctionBegin;
182   if (row < 0 || row >= A->rmap->n) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row %D out of range",row);
183   if (nz) *nz = a->rlen[row];
184   shift = a->sliidx[row>>3]+(row&0x07);
185   if (!a->getrowcols) {
186     PetscErrorCode ierr;
187 
188     ierr = PetscMalloc2(a->rlenmax,&a->getrowcols,a->rlenmax,&a->getrowvals);CHKERRQ(ierr);
189   }
190   if (idx) {
191     PetscInt j;
192     for (j=0; j<a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift+8*j];
193     *idx = a->getrowcols;
194   }
195   if (v) {
196     PetscInt j;
197     for (j=0; j<a->rlen[row]; j++) a->getrowvals[j] = a->val[shift+8*j];
198     *v = a->getrowvals;
199   }
200   PetscFunctionReturn(0);
201 }
202 
203 PetscErrorCode MatRestoreRow_SeqSELL(Mat A,PetscInt row,PetscInt *nz,PetscInt **idx,PetscScalar **v)
204 {
205   PetscFunctionBegin;
206   PetscFunctionReturn(0);
207 }
208 
209 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype,MatReuse reuse,Mat *newmat)
210 {
211   Mat            B;
212   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
213   PetscInt       i;
214   PetscErrorCode ierr;
215 
216   PetscFunctionBegin;
217   if (reuse == MAT_REUSE_MATRIX) {
218     B    = *newmat;
219     ierr = MatZeroEntries(B);CHKERRQ(ierr);
220   } else {
221     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
222     ierr = MatSetSizes(B,A->rmap->n,A->cmap->n,A->rmap->N,A->cmap->N);CHKERRQ(ierr);
223     ierr = MatSetType(B,MATSEQAIJ);CHKERRQ(ierr);
224     ierr = MatSeqAIJSetPreallocation(B,0,a->rlen);CHKERRQ(ierr);
225   }
226 
227   for (i=0; i<A->rmap->n; i++) {
228     PetscInt    nz,*cols;
229     PetscScalar *vals;
230 
231     ierr = MatGetRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
232     ierr = MatSetValues(B,1,&i,nz,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
233     ierr = MatRestoreRow_SeqSELL(A,i,&nz,&cols,&vals);CHKERRQ(ierr);
234   }
235 
236   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
237   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
238   B->rmap->bs = A->rmap->bs;
239 
240   if (reuse == MAT_INPLACE_MATRIX) {
241     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
242   } else {
243     *newmat = B;
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 #include <../src/mat/impls/aij/seq/aij.h>
249 
250 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A,MatType newtype,MatReuse reuse,Mat *newmat)
251 {
252   Mat               B;
253   Mat_SeqAIJ        *a=(Mat_SeqAIJ*)A->data;
254   PetscInt          *ai=a->i,m=A->rmap->N,n=A->cmap->N,i,*rowlengths,row,ncols;
255   const PetscInt    *cols;
256   const PetscScalar *vals;
257   PetscErrorCode    ierr;
258 
259   PetscFunctionBegin;
260   if (A->rmap->bs > 1) {
261     ierr = MatConvert_Basic(A,newtype,reuse,newmat);CHKERRQ(ierr);
262     PetscFunctionReturn(0);
263   }
264 
265   if (reuse == MAT_REUSE_MATRIX) {
266     B = *newmat;
267   } else {
268     /* Can we just use ilen? */
269     ierr = PetscMalloc1(m,&rowlengths);CHKERRQ(ierr);
270     for (i=0; i<m; i++) {
271       rowlengths[i] = ai[i+1] - ai[i];
272     }
273 
274     ierr = MatCreate(PetscObjectComm((PetscObject)A),&B);CHKERRQ(ierr);
275     ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
276     ierr = MatSetType(B,MATSEQSELL);CHKERRQ(ierr);
277     ierr = MatSeqSELLSetPreallocation(B,0,rowlengths);CHKERRQ(ierr);
278     ierr = PetscFree(rowlengths);CHKERRQ(ierr);
279   }
280 
281   for (row=0; row<m; row++) {
282     ierr = MatGetRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
283     ierr = MatSetValues(B,1,&row,ncols,cols,vals,INSERT_VALUES);CHKERRQ(ierr);
284     ierr = MatRestoreRow(A,row,&ncols,&cols,&vals);CHKERRQ(ierr);
285   }
286   ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
287   ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
288   B->rmap->bs = A->rmap->bs;
289 
290   if (reuse == MAT_INPLACE_MATRIX) {
291     ierr = MatHeaderReplace(A,&B);CHKERRQ(ierr);
292   } else {
293     *newmat = B;
294   }
295   PetscFunctionReturn(0);
296 }
297 
298 PetscErrorCode MatMult_SeqSELL(Mat A,Vec xx,Vec yy)
299 {
300   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
301   PetscScalar       *y;
302   const PetscScalar *x;
303   const MatScalar   *aval=a->val;
304   PetscInt          totalslices=a->totalslices;
305   const PetscInt    *acolidx=a->colidx;
306   PetscInt          i,j;
307   PetscErrorCode    ierr;
308 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
309   __m512d           vec_x,vec_y,vec_vals;
310   __m256i           vec_idx;
311   __mmask8          mask;
312   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
313   __m256i           vec_idx2,vec_idx3,vec_idx4;
314 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
315   __m128i           vec_idx;
316   __m256d           vec_x,vec_y,vec_y2,vec_vals;
317   MatScalar         yval;
318   PetscInt          r,rows_left,row,nnz_in_row;
319 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
320   __m128d           vec_x_tmp;
321   __m256d           vec_x,vec_y,vec_y2,vec_vals;
322   MatScalar         yval;
323   PetscInt          r,rows_left,row,nnz_in_row;
324 #else
325   PetscScalar       sum[8];
326 #endif
327 
328 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
329 #pragma disjoint(*x,*y,*aval)
330 #endif
331 
332   PetscFunctionBegin;
333   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
334   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
335 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
336   for (i=0; i<totalslices; i++) { /* loop over slices */
337     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
338     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
339 
340     vec_y  = _mm512_setzero_pd();
341     vec_y2 = _mm512_setzero_pd();
342     vec_y3 = _mm512_setzero_pd();
343     vec_y4 = _mm512_setzero_pd();
344 
345     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
346     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
347     case 3:
348       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
349       acolidx += 8; aval += 8;
350       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
351       acolidx += 8; aval += 8;
352       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
353       acolidx += 8; aval += 8;
354       j += 3;
355       break;
356     case 2:
357       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
358       acolidx += 8; aval += 8;
359       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
360       acolidx += 8; aval += 8;
361       j += 2;
362       break;
363     case 1:
364       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
365       acolidx += 8; aval += 8;
366       j += 1;
367       break;
368     }
369     #pragma novector
370     for (; j<(a->sliidx[i+1]>>3); j+=4) {
371       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
372       acolidx += 8; aval += 8;
373       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
374       acolidx += 8; aval += 8;
375       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
376       acolidx += 8; aval += 8;
377       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
378       acolidx += 8; aval += 8;
379     }
380 
381     vec_y = _mm512_add_pd(vec_y,vec_y2);
382     vec_y = _mm512_add_pd(vec_y,vec_y3);
383     vec_y = _mm512_add_pd(vec_y,vec_y4);
384     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
385       mask = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
386       _mm512_mask_store_pd(&y[8*i],mask,vec_y);
387     } else {
388       _mm512_store_pd(&y[8*i],vec_y);
389     }
390   }
391 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
392   for (i=0; i<totalslices; i++) { /* loop over full slices */
393     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
394     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
395 
396     /* last slice may have padding rows. Don't use vectorization. */
397     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
398       rows_left = A->rmap->n - 8*i;
399       for (r=0; r<rows_left; ++r) {
400         yval = (MatScalar)0;
401         row = 8*i + r;
402         nnz_in_row = a->rlen[row];
403         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
404         y[row] = yval;
405       }
406       break;
407     }
408 
409     vec_y  = _mm256_setzero_pd();
410     vec_y2 = _mm256_setzero_pd();
411 
412     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
413     #pragma novector
414     #pragma unroll(2)
415     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
416       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
417       aval += 4; acolidx += 4;
418       AVX2_Mult_Private(vec_idx,vec_x,vec_vals,vec_y2);
419       aval += 4; acolidx += 4;
420     }
421 
422     _mm256_store_pd(y+i*8,vec_y);
423     _mm256_store_pd(y+i*8+4,vec_y2);
424   }
425 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
426   for (i=0; i<totalslices; i++) { /* loop over full slices */
427     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
428     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
429 
430     vec_y  = _mm256_setzero_pd();
431     vec_y2 = _mm256_setzero_pd();
432 
433     /* last slice may have padding rows. Don't use vectorization. */
434     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
435       rows_left = A->rmap->n - 8*i;
436       for (r=0; r<rows_left; ++r) {
437         yval = (MatScalar)0;
438         row = 8*i + r;
439         nnz_in_row = a->rlen[row];
440         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j + r] * x[acolidx[8*j + r]];
441         y[row] = yval;
442       }
443       break;
444     }
445 
446     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
447     #pragma novector
448     #pragma unroll(2)
449     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
450       vec_vals  = _mm256_loadu_pd(aval);
451       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
452       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
453       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
454       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
455       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
456       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
457       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
458       aval     += 4;
459 
460       vec_vals  = _mm256_loadu_pd(aval);
461       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
462       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
463       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
464       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
465       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
466       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
467       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
468       aval     += 4;
469     }
470 
471     _mm256_storeu_pd(y + i*8,     vec_y);
472     _mm256_storeu_pd(y + i*8 + 4, vec_y2);
473   }
474 #else
475   for (i=0; i<totalslices; i++) { /* loop over slices */
476     for (j=0; j<8; j++) sum[j] = 0.0;
477     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
478       sum[0] += aval[j] * x[acolidx[j]];
479       sum[1] += aval[j+1] * x[acolidx[j+1]];
480       sum[2] += aval[j+2] * x[acolidx[j+2]];
481       sum[3] += aval[j+3] * x[acolidx[j+3]];
482       sum[4] += aval[j+4] * x[acolidx[j+4]];
483       sum[5] += aval[j+5] * x[acolidx[j+5]];
484       sum[6] += aval[j+6] * x[acolidx[j+6]];
485       sum[7] += aval[j+7] * x[acolidx[j+7]];
486     }
487     if (i == totalslices-1 && (A->rmap->n & 0x07)) { /* if last slice has padding rows */
488       for(j=0; j<(A->rmap->n & 0x07); j++) y[8*i+j] = sum[j];
489     } else {
490       for(j=0; j<8; j++) y[8*i+j] = sum[j];
491     }
492   }
493 #endif
494 
495   ierr = PetscLogFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); /* theoretical minimal FLOPs */
496   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
497   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
498   PetscFunctionReturn(0);
499 }
500 
501 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h>
502 PetscErrorCode MatMultAdd_SeqSELL(Mat A,Vec xx,Vec yy,Vec zz)
503 {
504   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
505   PetscScalar       *y,*z;
506   const PetscScalar *x;
507   const MatScalar   *aval=a->val;
508   PetscInt          totalslices=a->totalslices;
509   const PetscInt    *acolidx=a->colidx;
510   PetscInt          i,j;
511   PetscErrorCode    ierr;
512 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
513   __m512d           vec_x,vec_y,vec_vals;
514   __m256i           vec_idx;
515   __mmask8          mask;
516   __m512d           vec_x2,vec_y2,vec_vals2,vec_x3,vec_y3,vec_vals3,vec_x4,vec_y4,vec_vals4;
517   __m256i           vec_idx2,vec_idx3,vec_idx4;
518 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
519   __m128d           vec_x_tmp;
520   __m256d           vec_x,vec_y,vec_y2,vec_vals;
521   MatScalar         yval;
522   PetscInt          r,row,nnz_in_row;
523 #else
524   PetscScalar       sum[8];
525 #endif
526 
527 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
528 #pragma disjoint(*x,*y,*aval)
529 #endif
530 
531   PetscFunctionBegin;
532   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
533   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
534 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
535   for (i=0; i<totalslices; i++) { /* loop over slices */
536     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
537     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
538 
539     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
540       mask   = (__mmask8)(0xff >> (8-(A->rmap->n & 0x07)));
541       vec_y  = _mm512_mask_load_pd(vec_y,mask,&y[8*i]);
542     } else {
543       vec_y  = _mm512_load_pd(&y[8*i]);
544     }
545     vec_y2 = _mm512_setzero_pd();
546     vec_y3 = _mm512_setzero_pd();
547     vec_y4 = _mm512_setzero_pd();
548 
549     j = a->sliidx[i]>>3; /* 8 bytes are read at each time, corresponding to a slice columnn */
550     switch ((a->sliidx[i+1]-a->sliidx[i])/8 & 3) {
551     case 3:
552       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
553       acolidx += 8; aval += 8;
554       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
555       acolidx += 8; aval += 8;
556       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
557       acolidx += 8; aval += 8;
558       j += 3;
559       break;
560     case 2:
561       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
562       acolidx += 8; aval += 8;
563       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
564       acolidx += 8; aval += 8;
565       j += 2;
566       break;
567     case 1:
568       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
569       acolidx += 8; aval += 8;
570       j += 1;
571       break;
572     }
573     #pragma novector
574     for (; j<(a->sliidx[i+1]>>3); j+=4) {
575       AVX512_Mult_Private(vec_idx,vec_x,vec_vals,vec_y);
576       acolidx += 8; aval += 8;
577       AVX512_Mult_Private(vec_idx2,vec_x2,vec_vals2,vec_y2);
578       acolidx += 8; aval += 8;
579       AVX512_Mult_Private(vec_idx3,vec_x3,vec_vals3,vec_y3);
580       acolidx += 8; aval += 8;
581       AVX512_Mult_Private(vec_idx4,vec_x4,vec_vals4,vec_y4);
582       acolidx += 8; aval += 8;
583     }
584 
585     vec_y = _mm512_add_pd(vec_y,vec_y2);
586     vec_y = _mm512_add_pd(vec_y,vec_y3);
587     vec_y = _mm512_add_pd(vec_y,vec_y4);
588     if (i == totalslices-1 && A->rmap->n & 0x07) { /* if last slice has padding rows */
589       _mm512_mask_store_pd(&z[8*i],mask,vec_y);
590     } else {
591       _mm512_store_pd(&z[8*i],vec_y);
592     }
593   }
594 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX)
595   for (i=0; i<totalslices; i++) { /* loop over full slices */
596     PetscPrefetchBlock(acolidx,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
597     PetscPrefetchBlock(aval,a->sliidx[i+1]-a->sliidx[i],0,PETSC_PREFETCH_HINT_T0);
598 
599     /* last slice may have padding rows. Don't use vectorization. */
600     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
601       for (r=0; r<(A->rmap->n & 0x07); ++r) {
602         row        = 8*i + r;
603         yval       = (MatScalar)0.0;
604         nnz_in_row = a->rlen[row];
605         for (j=0; j<nnz_in_row; ++j) yval += aval[8*j+r] * x[acolidx[8*j+r]];
606         z[row] = y[row] + yval;
607       }
608       break;
609     }
610 
611     vec_y  = _mm256_loadu_pd(y+8*i);
612     vec_y2 = _mm256_loadu_pd(y+8*i+4);
613 
614     /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */
615     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
616       vec_vals  = _mm256_loadu_pd(aval);
617       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
618       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
619       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
620       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
621       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
622       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
623       vec_y     = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y);
624       aval     += 4;
625 
626       vec_vals  = _mm256_loadu_pd(aval);
627       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
628       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
629       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,0);
630       vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++);
631       vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++);
632       vec_x     = _mm256_insertf128_pd(vec_x,vec_x_tmp,1);
633       vec_y2    = _mm256_add_pd(_mm256_mul_pd(vec_x,vec_vals),vec_y2);
634       aval     += 4;
635     }
636 
637     _mm256_storeu_pd(z+i*8,vec_y);
638     _mm256_storeu_pd(z+i*8+4,vec_y2);
639   }
640 #else
641   for (i=0; i<totalslices; i++) { /* loop over slices */
642     for (j=0; j<8; j++) sum[j] = 0.0;
643     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
644       sum[0] += aval[j] * x[acolidx[j]];
645       sum[1] += aval[j+1] * x[acolidx[j+1]];
646       sum[2] += aval[j+2] * x[acolidx[j+2]];
647       sum[3] += aval[j+3] * x[acolidx[j+3]];
648       sum[4] += aval[j+4] * x[acolidx[j+4]];
649       sum[5] += aval[j+5] * x[acolidx[j+5]];
650       sum[6] += aval[j+6] * x[acolidx[j+6]];
651       sum[7] += aval[j+7] * x[acolidx[j+7]];
652     }
653     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
654       for (j=0; j<(A->rmap->n & 0x07); j++) z[8*i+j] = y[8*i+j] + sum[j];
655     } else {
656       for (j=0; j<8; j++) z[8*i+j] = y[8*i+j] + sum[j];
657     }
658   }
659 #endif
660 
661   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
662   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
663   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
664   PetscFunctionReturn(0);
665 }
666 
667 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A,Vec xx,Vec zz,Vec yy)
668 {
669   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
670   PetscScalar       *y;
671   const PetscScalar *x;
672   const MatScalar   *aval=a->val;
673   const PetscInt    *acolidx=a->colidx;
674   PetscInt          i,j,r,row,nnz_in_row,totalslices=a->totalslices;
675   PetscErrorCode    ierr;
676 
677 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
678 #pragma disjoint(*x,*y,*aval)
679 #endif
680 
681   PetscFunctionBegin;
682   if (A->symmetric) {
683     ierr = MatMultAdd_SeqSELL(A,xx,zz,yy);CHKERRQ(ierr);
684     PetscFunctionReturn(0);
685   }
686   if (zz != yy) { ierr = VecCopy(zz,yy);CHKERRQ(ierr); }
687   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
688   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
689   for (i=0; i<a->totalslices; i++) { /* loop over slices */
690     if (i == totalslices-1 && (A->rmap->n & 0x07)) {
691       for (r=0; r<(A->rmap->n & 0x07); ++r) {
692         row        = 8*i + r;
693         nnz_in_row = a->rlen[row];
694         for (j=0; j<nnz_in_row; ++j) y[acolidx[8*j+r]] += aval[8*j+r] * x[row];
695       }
696       break;
697     }
698     for (j=a->sliidx[i]; j<a->sliidx[i+1]; j+=8) {
699       y[acolidx[j]]   += aval[j] * x[8*i];
700       y[acolidx[j+1]] += aval[j+1] * x[8*i+1];
701       y[acolidx[j+2]] += aval[j+2] * x[8*i+2];
702       y[acolidx[j+3]] += aval[j+3] * x[8*i+3];
703       y[acolidx[j+4]] += aval[j+4] * x[8*i+4];
704       y[acolidx[j+5]] += aval[j+5] * x[8*i+5];
705       y[acolidx[j+6]] += aval[j+6] * x[8*i+6];
706       y[acolidx[j+7]] += aval[j+7] * x[8*i+7];
707     }
708   }
709   ierr = PetscLogFlops(2.0*a->sliidx[a->totalslices]);CHKERRQ(ierr);
710   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
711   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
715 PetscErrorCode MatMultTranspose_SeqSELL(Mat A,Vec xx,Vec yy)
716 {
717   PetscErrorCode ierr;
718 
719   PetscFunctionBegin;
720   if (A->symmetric) {
721     ierr = MatMult_SeqSELL(A,xx,yy);CHKERRQ(ierr);
722   } else {
723     ierr = VecSet(yy,0.0);CHKERRQ(ierr);
724     ierr = MatMultTransposeAdd_SeqSELL(A,xx,yy,yy);CHKERRQ(ierr);
725   }
726   PetscFunctionReturn(0);
727 }
728 
729 /*
730      Checks for missing diagonals
731 */
732 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A,PetscBool  *missing,PetscInt *d)
733 {
734   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
735   PetscInt    *diag,i;
736 
737   PetscFunctionBegin;
738   *missing = PETSC_FALSE;
739   if (A->rmap->n > 0 && !(a->colidx)) {
740     *missing = PETSC_TRUE;
741     if (d) *d = 0;
742     PetscInfo(A,"Matrix has no entries therefore is missing diagonal\n");
743   } else {
744     diag = a->diag;
745     for (i=0; i<A->rmap->n; i++) {
746       if (diag[i] == -1) {
747         *missing = PETSC_TRUE;
748         if (d) *d = i;
749         PetscInfo1(A,"Matrix is missing diagonal number %D\n",i);
750         break;
751       }
752     }
753   }
754   PetscFunctionReturn(0);
755 }
756 
757 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A)
758 {
759   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
760   PetscInt       i,j,m=A->rmap->n,shift;
761   PetscErrorCode ierr;
762 
763   PetscFunctionBegin;
764   if (!a->diag) {
765     ierr         = PetscMalloc1(m,&a->diag);CHKERRQ(ierr);
766     ierr         = PetscLogObjectMemory((PetscObject)A,m*sizeof(PetscInt));CHKERRQ(ierr);
767     a->free_diag = PETSC_TRUE;
768   }
769   for (i=0; i<m; i++) { /* loop over rows */
770     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
771     a->diag[i] = -1;
772     for (j=0; j<a->rlen[i]; j++) {
773       if (a->colidx[shift+j*8] == i) {
774         a->diag[i] = shift+j*8;
775         break;
776       }
777     }
778   }
779   PetscFunctionReturn(0);
780 }
781 
782 /*
783   Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
784 */
785 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A,PetscScalar omega,PetscScalar fshift)
786 {
787   Mat_SeqSELL    *a=(Mat_SeqSELL*) A->data;
788   PetscInt       i,*diag,m = A->rmap->n;
789   MatScalar      *val = a->val;
790   PetscScalar    *idiag,*mdiag;
791   PetscErrorCode ierr;
792 
793   PetscFunctionBegin;
794   if (a->idiagvalid) PetscFunctionReturn(0);
795   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
796   diag = a->diag;
797   if (!a->idiag) {
798     ierr = PetscMalloc3(m,&a->idiag,m,&a->mdiag,m,&a->ssor_work);CHKERRQ(ierr);
799     ierr = PetscLogObjectMemory((PetscObject)A, 3*m*sizeof(PetscScalar));CHKERRQ(ierr);
800     val  = a->val;
801   }
802   mdiag = a->mdiag;
803   idiag = a->idiag;
804 
805   if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) {
806     for (i=0; i<m; i++) {
807       mdiag[i] = val[diag[i]];
808       if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */
809         if (PetscRealPart(fshift)) {
810           ierr = PetscInfo1(A,"Zero diagonal on row %D\n",i);CHKERRQ(ierr);
811           A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
812           A->factorerror_zeropivot_value = 0.0;
813           A->factorerror_zeropivot_row   = i;
814         } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Zero diagonal on row %D",i);
815       }
816       idiag[i] = 1.0/val[diag[i]];
817     }
818     ierr = PetscLogFlops(m);CHKERRQ(ierr);
819   } else {
820     for (i=0; i<m; i++) {
821       mdiag[i] = val[diag[i]];
822       idiag[i] = omega/(fshift + val[diag[i]]);
823     }
824     ierr = PetscLogFlops(2.0*m);CHKERRQ(ierr);
825   }
826   a->idiagvalid = PETSC_TRUE;
827   PetscFunctionReturn(0);
828 }
829 
830 PetscErrorCode MatZeroEntries_SeqSELL(Mat A)
831 {
832   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
833   PetscErrorCode ierr;
834 
835   PetscFunctionBegin;
836   ierr = PetscMemzero(a->val,(a->sliidx[a->totalslices])*sizeof(PetscScalar));CHKERRQ(ierr);
837   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
838   PetscFunctionReturn(0);
839 }
840 
841 PetscErrorCode MatDestroy_SeqSELL(Mat A)
842 {
843   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
844   PetscErrorCode ierr;
845 
846   PetscFunctionBegin;
847 #if defined(PETSC_USE_LOG)
848   PetscLogObjectState((PetscObject)A,"Rows=%D, Cols=%D, NZ=%D",A->rmap->n,A->cmap->n,a->nz);
849 #endif
850   ierr = MatSeqXSELLFreeSELL(A,&a->val,&a->colidx);CHKERRQ(ierr);
851   ierr = ISDestroy(&a->row);CHKERRQ(ierr);
852   ierr = ISDestroy(&a->col);CHKERRQ(ierr);
853   ierr = PetscFree(a->diag);CHKERRQ(ierr);
854   ierr = PetscFree(a->ibdiag);CHKERRQ(ierr);
855   ierr = PetscFree(a->rlen);CHKERRQ(ierr);
856   ierr = PetscFree(a->sliidx);CHKERRQ(ierr);
857   ierr = PetscFree3(a->idiag,a->mdiag,a->ssor_work);CHKERRQ(ierr);
858   ierr = PetscFree(a->solve_work);CHKERRQ(ierr);
859   ierr = ISDestroy(&a->icol);CHKERRQ(ierr);
860   ierr = PetscFree(a->saved_values);CHKERRQ(ierr);
861   ierr = PetscFree2(a->getrowcols,a->getrowvals);CHKERRQ(ierr);
862 
863   ierr = PetscFree(A->data);CHKERRQ(ierr);
864 
865   ierr = PetscObjectChangeTypeName((PetscObject)A,0);CHKERRQ(ierr);
866   ierr = PetscObjectComposeFunction((PetscObject)A,"MatStoreValues_C",NULL);CHKERRQ(ierr);
867   ierr = PetscObjectComposeFunction((PetscObject)A,"MatRetrieveValues_C",NULL);CHKERRQ(ierr);
868 #if defined(PETSC_HAVE_ELEMENTAL)
869 #endif
870   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqSELLSetPreallocation_C",NULL);CHKERRQ(ierr);
871   PetscFunctionReturn(0);
872 }
873 
874 PetscErrorCode MatSetOption_SeqSELL(Mat A,MatOption op,PetscBool flg)
875 {
876   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
877   PetscErrorCode ierr;
878 
879   PetscFunctionBegin;
880   switch (op) {
881   case MAT_ROW_ORIENTED:
882     a->roworiented = flg;
883     break;
884   case MAT_KEEP_NONZERO_PATTERN:
885     a->keepnonzeropattern = flg;
886     break;
887   case MAT_NEW_NONZERO_LOCATIONS:
888     a->nonew = (flg ? 0 : 1);
889     break;
890   case MAT_NEW_NONZERO_LOCATION_ERR:
891     a->nonew = (flg ? -1 : 0);
892     break;
893   case MAT_NEW_NONZERO_ALLOCATION_ERR:
894     a->nonew = (flg ? -2 : 0);
895     break;
896   case MAT_UNUSED_NONZERO_LOCATION_ERR:
897     a->nounused = (flg ? -1 : 0);
898     break;
899   case MAT_NEW_DIAGONALS:
900   case MAT_IGNORE_OFF_PROC_ENTRIES:
901   case MAT_USE_HASH_TABLE:
902     ierr = PetscInfo1(A,"Option %s ignored\n",MatOptions[op]);CHKERRQ(ierr);
903     break;
904   case MAT_SPD:
905   case MAT_SYMMETRIC:
906   case MAT_STRUCTURALLY_SYMMETRIC:
907   case MAT_HERMITIAN:
908   case MAT_SYMMETRY_ETERNAL:
909     /* These options are handled directly by MatSetOption() */
910     break;
911   default:
912     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unknown option %d",op);
913   }
914   PetscFunctionReturn(0);
915 }
916 
917 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A,Vec v)
918 {
919   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
920   PetscInt       i,j,n,shift;
921   PetscScalar    *x,zero=0.0;
922   PetscErrorCode ierr;
923 
924   PetscFunctionBegin;
925   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
926   if (n != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
927 
928   if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) {
929     PetscInt *diag=a->diag;
930     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
931     for (i=0; i<n; i++) x[i] = 1.0/a->val[diag[i]];
932     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
933     PetscFunctionReturn(0);
934   }
935 
936   ierr = VecSet(v,zero);CHKERRQ(ierr);
937   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
938   for (i=0; i<n; i++) { /* loop over rows */
939     shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
940     x[i] = 0;
941     for (j=0; j<a->rlen[i]; j++) {
942       if (a->colidx[shift+j*8] == i) {
943         x[i] = a->val[shift+j*8];
944         break;
945       }
946     }
947   }
948   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
949   PetscFunctionReturn(0);
950 }
951 
952 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A,Vec ll,Vec rr)
953 {
954   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
955   const PetscScalar *l,*r;
956   PetscInt          i,j,m,n,row;
957   PetscErrorCode    ierr;
958 
959   PetscFunctionBegin;
960   if (ll) {
961     /* The local size is used so that VecMPI can be passed to this routine
962        by MatDiagonalScale_MPISELL */
963     ierr = VecGetLocalSize(ll,&m);CHKERRQ(ierr);
964     if (m != A->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
965     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
966     for (i=0; i<a->totalslices; i++) { /* loop over slices */
967       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
968         a->val[j] *= l[8*i+row];
969       }
970     }
971     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
972     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
973   }
974   if (rr) {
975     ierr = VecGetLocalSize(rr,&n);CHKERRQ(ierr);
976     if (n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
977     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
978     for (i=0; i<a->totalslices; i++) { /* loop over slices */
979       for (j=a->sliidx[i]; j<a->sliidx[i+1]; j++) {
980         a->val[j] *= r[a->colidx[j]];
981       }
982     }
983     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
984     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
985   }
986   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
987   PetscFunctionReturn(0);
988 }
989 
990 extern PetscErrorCode MatSetValues_SeqSELL(Mat,PetscInt,const PetscInt[],PetscInt,const PetscInt[],const PetscScalar[],InsertMode);
991 
992 PetscErrorCode MatGetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],PetscScalar v[])
993 {
994   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
995   PetscInt    *cp,i,k,low,high,t,row,col,l;
996   PetscInt    shift;
997   MatScalar   *vp;
998 
999   PetscFunctionBegin;
1000   for (k=0; k<m; k++) { /* loop over requested rows */
1001     row = im[k];
1002     if (row<0) continue;
1003 #if defined(PETSC_USE_DEBUG)
1004     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1005 #endif
1006     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1007     cp = a->colidx+shift; /* pointer to the row */
1008     vp = a->val+shift; /* pointer to the row */
1009     for (l=0; l<n; l++) { /* loop over requested columns */
1010       col = in[l];
1011       if (col<0) continue;
1012 #if defined(PETSC_USE_DEBUG)
1013       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Column too large: row %D max %D",col,A->cmap->n-1);
1014 #endif
1015       high = a->rlen[row]; low = 0; /* assume unsorted */
1016       while (high-low > 5) {
1017         t = (low+high)/2;
1018         if (*(cp+t*8) > col) high = t;
1019         else low = t;
1020       }
1021       for (i=low; i<high; i++) {
1022         if (*(cp+8*i) > col) break;
1023         if (*(cp+8*i) == col) {
1024           *v++ = *(vp+8*i);
1025           goto finished;
1026         }
1027       }
1028       *v++ = 0.0;
1029     finished:;
1030     }
1031   }
1032   PetscFunctionReturn(0);
1033 }
1034 
1035 PetscErrorCode MatView_SeqSELL_ASCII(Mat A,PetscViewer viewer)
1036 {
1037   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1038   PetscInt          i,j,m=A->rmap->n,shift;
1039   const char        *name;
1040   PetscViewerFormat format;
1041   PetscErrorCode    ierr;
1042 
1043   PetscFunctionBegin;
1044   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1045   if (format == PETSC_VIEWER_ASCII_MATLAB) {
1046     PetscInt nofinalvalue = 0;
1047     /*
1048     if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) {
1049       nofinalvalue = 1;
1050     }
1051     */
1052     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1053     ierr = PetscViewerASCIIPrintf(viewer,"%% Size = %D %D \n",m,A->cmap->n);CHKERRQ(ierr);
1054     ierr = PetscViewerASCIIPrintf(viewer,"%% Nonzeros = %D \n",a->nz);CHKERRQ(ierr);
1055 #if defined(PETSC_USE_COMPLEX)
1056     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,4);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1057 #else
1058     ierr = PetscViewerASCIIPrintf(viewer,"zzz = zeros(%D,3);\n",a->nz+nofinalvalue);CHKERRQ(ierr);
1059 #endif
1060     ierr = PetscViewerASCIIPrintf(viewer,"zzz = [\n");CHKERRQ(ierr);
1061 
1062     for (i=0; i<m; i++) {
1063       shift = a->sliidx[i>>3]+(i&0x07);
1064       for (j=0; j<a->rlen[i]; j++) {
1065 #if defined(PETSC_USE_COMPLEX)
1066         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1067 #else
1068         ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",i+1,a->colidx[shift+8*j]+1,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1069 #endif
1070       }
1071     }
1072     /*
1073     if (nofinalvalue) {
1074 #if defined(PETSC_USE_COMPLEX)
1075       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e %18.16e\n",m,A->cmap->n,0.,0.);CHKERRQ(ierr);
1076 #else
1077       ierr = PetscViewerASCIIPrintf(viewer,"%D %D  %18.16e\n",m,A->cmap->n,0.0);CHKERRQ(ierr);
1078 #endif
1079     }
1080     */
1081     ierr = PetscObjectGetName((PetscObject)A,&name);CHKERRQ(ierr);
1082     ierr = PetscViewerASCIIPrintf(viewer,"];\n %s = spconvert(zzz);\n",name);CHKERRQ(ierr);
1083     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1084   } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) {
1085     PetscFunctionReturn(0);
1086   } else if (format == PETSC_VIEWER_ASCII_COMMON) {
1087     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1088     for (i=0; i<m; i++) {
1089       ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1090       shift = a->sliidx[i>>3]+(i&0x07);
1091       for (j=0; j<a->rlen[i]; j++) {
1092 #if defined(PETSC_USE_COMPLEX)
1093         if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1094           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1095         } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0 && PetscRealPart(a->val[shift+8*j]) != 0.0) {
1096           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1097         } else if (PetscRealPart(a->val[shift+8*j]) != 0.0) {
1098           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1099         }
1100 #else
1101         if (a->val[shift+8*j] != 0.0) {ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);}
1102 #endif
1103       }
1104       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1105     }
1106     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1107   } else if (format == PETSC_VIEWER_ASCII_DENSE) {
1108     PetscInt    cnt=0,jcnt;
1109     PetscScalar value;
1110 #if defined(PETSC_USE_COMPLEX)
1111     PetscBool   realonly=PETSC_TRUE;
1112     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1113       if (PetscImaginaryPart(a->val[i]) != 0.0) {
1114         realonly = PETSC_FALSE;
1115         break;
1116       }
1117     }
1118 #endif
1119 
1120     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1121     for (i=0; i<m; i++) {
1122       jcnt = 0;
1123       shift = a->sliidx[i>>3]+(i&0x07);
1124       for (j=0; j<A->cmap->n; j++) {
1125         if (jcnt < a->rlen[i] && j == a->colidx[shift+8*j]) {
1126           value = a->val[cnt++];
1127           jcnt++;
1128         } else {
1129           value = 0.0;
1130         }
1131 #if defined(PETSC_USE_COMPLEX)
1132         if (realonly) {
1133           ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)PetscRealPart(value));CHKERRQ(ierr);
1134         } else {
1135           ierr = PetscViewerASCIIPrintf(viewer," %7.5e+%7.5e i ",(double)PetscRealPart(value),(double)PetscImaginaryPart(value));CHKERRQ(ierr);
1136         }
1137 #else
1138         ierr = PetscViewerASCIIPrintf(viewer," %7.5e ",(double)value);CHKERRQ(ierr);
1139 #endif
1140       }
1141       ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1142     }
1143     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1144   } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) {
1145     PetscInt fshift=1;
1146     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1147 #if defined(PETSC_USE_COMPLEX)
1148     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate complex general\n");CHKERRQ(ierr);
1149 #else
1150     ierr = PetscViewerASCIIPrintf(viewer,"%%%%MatrixMarket matrix coordinate real general\n");CHKERRQ(ierr);
1151 #endif
1152     ierr = PetscViewerASCIIPrintf(viewer,"%D %D %D\n", m, A->cmap->n, a->nz);CHKERRQ(ierr);
1153     for (i=0; i<m; i++) {
1154       shift = a->sliidx[i>>3]+(i&0x07);
1155       for (j=0; j<a->rlen[i]; j++) {
1156 #if defined(PETSC_USE_COMPLEX)
1157         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1158 #else
1159         ierr = PetscViewerASCIIPrintf(viewer,"%D %D %g\n",i+fshift,a->colidx[shift+8*j]+fshift,(double)a->val[shift+8*j]);CHKERRQ(ierr);
1160 #endif
1161       }
1162     }
1163     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1164   } else if (format == PETSC_VIEWER_NATIVE) {
1165     for (i=0; i<a->totalslices; i++) { /* loop over slices */
1166       PetscInt row;
1167       ierr = PetscViewerASCIIPrintf(viewer,"slice %D: %D %D\n",i,a->sliidx[i],a->sliidx[i+1]);CHKERRQ(ierr);
1168       for (j=a->sliidx[i],row=0; j<a->sliidx[i+1]; j++,row=((row+1)&0x07)) {
1169 #if defined(PETSC_USE_COMPLEX)
1170         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1171           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g + %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1172         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1173           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g - %g i\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]),-(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1174         } else {
1175           ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1176         }
1177 #else
1178         ierr = PetscViewerASCIIPrintf(viewer,"  %D %D %g\n",8*i+row,a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1179 #endif
1180       }
1181     }
1182   } else {
1183     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);CHKERRQ(ierr);
1184     if (A->factortype) {
1185       for (i=0; i<m; i++) {
1186         shift = a->sliidx[i>>3]+(i&0x07);
1187         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1188         /* L part */
1189         for (j=shift; j<a->diag[i]; j+=8) {
1190 #if defined(PETSC_USE_COMPLEX)
1191           if (PetscImaginaryPart(a->val[shift+8*j]) > 0.0) {
1192             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1193           } else if (PetscImaginaryPart(a->val[shift+8*j]) < 0.0) {
1194             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1195           } else {
1196             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1197           }
1198 #else
1199           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1200 #endif
1201         }
1202         /* diagonal */
1203         j = a->diag[i];
1204 #if defined(PETSC_USE_COMPLEX)
1205         if (PetscImaginaryPart(a->val[j]) > 0.0) {
1206           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)PetscImaginaryPart(1.0/a->val[j]));CHKERRQ(ierr);
1207         } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1208           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]),(double)(-PetscImaginaryPart(1.0/a->val[j])));CHKERRQ(ierr);
1209         } else {
1210           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(1.0/a->val[j]));CHKERRQ(ierr);
1211         }
1212 #else
1213         ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)(1.0/a->val[j]));CHKERRQ(ierr);
1214 #endif
1215 
1216         /* U part */
1217         for (j=a->diag[i]+1; j<shift+8*a->rlen[i]; j+=8) {
1218 #if defined(PETSC_USE_COMPLEX)
1219           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1220             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)PetscImaginaryPart(a->val[j]));CHKERRQ(ierr);
1221           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1222             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[j],(double)PetscRealPart(a->val[j]),(double)(-PetscImaginaryPart(a->val[j])));CHKERRQ(ierr);
1223           } else {
1224             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)PetscRealPart(a->val[j]));CHKERRQ(ierr);
1225           }
1226 #else
1227           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[j],(double)a->val[j]);CHKERRQ(ierr);
1228 #endif
1229         }
1230         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1231       }
1232     } else {
1233       for (i=0; i<m; i++) {
1234         shift = a->sliidx[i>>3]+(i&0x07);
1235         ierr = PetscViewerASCIIPrintf(viewer,"row %D:",i);CHKERRQ(ierr);
1236         for (j=0; j<a->rlen[i]; j++) {
1237 #if defined(PETSC_USE_COMPLEX)
1238           if (PetscImaginaryPart(a->val[j]) > 0.0) {
1239             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g + %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1240           } else if (PetscImaginaryPart(a->val[j]) < 0.0) {
1241             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g - %g i)",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]),(double)-PetscImaginaryPart(a->val[shift+8*j]));CHKERRQ(ierr);
1242           } else {
1243             ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)PetscRealPart(a->val[shift+8*j]));CHKERRQ(ierr);
1244           }
1245 #else
1246           ierr = PetscViewerASCIIPrintf(viewer," (%D, %g) ",a->colidx[shift+8*j],(double)a->val[shift+8*j]);CHKERRQ(ierr);
1247 #endif
1248         }
1249         ierr = PetscViewerASCIIPrintf(viewer,"\n");CHKERRQ(ierr);
1250       }
1251     }
1252     ierr = PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);CHKERRQ(ierr);
1253   }
1254   ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
1255   PetscFunctionReturn(0);
1256 }
1257 
1258 #include <petscdraw.h>
1259 PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw,void *Aa)
1260 {
1261   Mat               A=(Mat)Aa;
1262   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1263   PetscInt          i,j,m=A->rmap->n,shift;
1264   int               color;
1265   PetscReal         xl,yl,xr,yr,x_l,x_r,y_l,y_r;
1266   PetscViewer       viewer;
1267   PetscViewerFormat format;
1268   PetscErrorCode    ierr;
1269 
1270   PetscFunctionBegin;
1271   ierr = PetscObjectQuery((PetscObject)A,"Zoomviewer",(PetscObject*)&viewer);CHKERRQ(ierr);
1272   ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
1273   ierr = PetscDrawGetCoordinates(draw,&xl,&yl,&xr,&yr);CHKERRQ(ierr);
1274 
1275   /* loop over matrix elements drawing boxes */
1276 
1277   if (format != PETSC_VIEWER_DRAW_CONTOUR) {
1278     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1279     /* Blue for negative, Cyan for zero and  Red for positive */
1280     color = PETSC_DRAW_BLUE;
1281     for (i=0; i<m; i++) {
1282       shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1283       y_l = m - i - 1.0; y_r = y_l + 1.0;
1284       for (j=0; j<a->rlen[i]; j++) {
1285         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1286         if (PetscRealPart(a->val[shift+8*j]) >=  0.) continue;
1287         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1288       }
1289     }
1290     color = PETSC_DRAW_CYAN;
1291     for (i=0; i<m; i++) {
1292       shift = a->sliidx[i>>3]+(i&0x07);
1293       y_l = m - i - 1.0; y_r = y_l + 1.0;
1294       for (j=0; j<a->rlen[i]; j++) {
1295         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1296         if (a->val[shift+8*j] !=  0.) continue;
1297         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1298       }
1299     }
1300     color = PETSC_DRAW_RED;
1301     for (i=0; i<m; i++) {
1302       shift = a->sliidx[i>>3]+(i&0x07);
1303       y_l = m - i - 1.0; y_r = y_l + 1.0;
1304       for (j=0; j<a->rlen[i]; j++) {
1305         x_l = a->colidx[shift+j*8]; x_r = x_l + 1.0;
1306         if (PetscRealPart(a->val[shift+8*j]) <=  0.) continue;
1307         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1308       }
1309     }
1310     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1311   } else {
1312     /* use contour shading to indicate magnitude of values */
1313     /* first determine max of all nonzero values */
1314     PetscReal minv=0.0,maxv=0.0;
1315     PetscInt  count=0;
1316     PetscDraw popup;
1317     for (i=0; i<a->sliidx[a->totalslices]; i++) {
1318       if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]);
1319     }
1320     if (minv >= maxv) maxv = minv + PETSC_SMALL;
1321     ierr = PetscDrawGetPopup(draw,&popup);CHKERRQ(ierr);
1322     ierr = PetscDrawScalePopup(popup,minv,maxv);CHKERRQ(ierr);
1323 
1324     ierr = PetscDrawCollectiveBegin(draw);CHKERRQ(ierr);
1325     for (i=0; i<m; i++) {
1326       shift = a->sliidx[i>>3]+(i&0x07);
1327       y_l = m - i - 1.0;
1328       y_r = y_l + 1.0;
1329       for (j=0; j<a->rlen[i]; j++) {
1330         x_l = a->colidx[shift+j*8];
1331         x_r = x_l + 1.0;
1332         color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]),minv,maxv);
1333         ierr = PetscDrawRectangle(draw,x_l,y_l,x_r,y_r,color,color,color,color);CHKERRQ(ierr);
1334         count++;
1335       }
1336     }
1337     ierr = PetscDrawCollectiveEnd(draw);CHKERRQ(ierr);
1338   }
1339   PetscFunctionReturn(0);
1340 }
1341 
1342 #include <petscdraw.h>
1343 PetscErrorCode MatView_SeqSELL_Draw(Mat A,PetscViewer viewer)
1344 {
1345   PetscDraw      draw;
1346   PetscReal      xr,yr,xl,yl,h,w;
1347   PetscBool      isnull;
1348   PetscErrorCode ierr;
1349 
1350   PetscFunctionBegin;
1351   ierr = PetscViewerDrawGetDraw(viewer,0,&draw);CHKERRQ(ierr);
1352   ierr = PetscDrawIsNull(draw,&isnull);CHKERRQ(ierr);
1353   if (isnull) PetscFunctionReturn(0);
1354 
1355   xr   = A->cmap->n; yr  = A->rmap->n; h = yr/10.0; w = xr/10.0;
1356   xr  += w;          yr += h;         xl = -w;     yl = -h;
1357   ierr = PetscDrawSetCoordinates(draw,xl,yl,xr,yr);CHKERRQ(ierr);
1358   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",(PetscObject)viewer);CHKERRQ(ierr);
1359   ierr = PetscDrawZoom(draw,MatView_SeqSELL_Draw_Zoom,A);CHKERRQ(ierr);
1360   ierr = PetscObjectCompose((PetscObject)A,"Zoomviewer",NULL);CHKERRQ(ierr);
1361   ierr = PetscDrawSave(draw);CHKERRQ(ierr);
1362   PetscFunctionReturn(0);
1363 }
1364 
1365 PetscErrorCode MatView_SeqSELL(Mat A,PetscViewer viewer)
1366 {
1367   PetscBool      iascii,isbinary,isdraw;
1368   PetscErrorCode ierr;
1369 
1370   PetscFunctionBegin;
1371   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
1372   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERBINARY,&isbinary);CHKERRQ(ierr);
1373   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERDRAW,&isdraw);CHKERRQ(ierr);
1374   if (iascii) {
1375     ierr = MatView_SeqSELL_ASCII(A,viewer);CHKERRQ(ierr);
1376   } else if (isbinary) {
1377     /* ierr = MatView_SeqSELL_Binary(A,viewer);CHKERRQ(ierr); */
1378   } else if (isdraw) {
1379     ierr = MatView_SeqSELL_Draw(A,viewer);CHKERRQ(ierr);
1380   }
1381   PetscFunctionReturn(0);
1382 }
1383 
1384 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A,MatAssemblyType mode)
1385 {
1386   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1387   PetscInt       i,shift,row_in_slice,row,nrow,*cp,lastcol,j,k;
1388   MatScalar      *vp;
1389   PetscErrorCode ierr;
1390 
1391   PetscFunctionBegin;
1392   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
1393   /* To do: compress out the unused elements */
1394   ierr = MatMarkDiagonal_SeqSELL(A);CHKERRQ(ierr);
1395   ierr = PetscInfo6(A,"Matrix size: %D X %D; storage space: %D allocated %D used (%D nonzeros+%D paddedzeros)\n",A->rmap->n,A->cmap->n,a->maxallocmat,a->sliidx[a->totalslices],a->nz,a->sliidx[a->totalslices]-a->nz);CHKERRQ(ierr);
1396   ierr = PetscInfo1(A,"Number of mallocs during MatSetValues() is %D\n",a->reallocs);CHKERRQ(ierr);
1397   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rlenmax);CHKERRQ(ierr);
1398   /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */
1399   for (i=0; i<a->totalslices; ++i) {
1400     shift = a->sliidx[i];    /* starting index of the slice */
1401     cp    = a->colidx+shift; /* pointer to the column indices of the slice */
1402     vp    = a->val+shift;    /* pointer to the nonzero values of the slice */
1403     for (row_in_slice=0; row_in_slice<8; ++row_in_slice) { /* loop over rows in the slice */
1404       row  = 8*i + row_in_slice;
1405       nrow = a->rlen[row]; /* number of nonzeros in row */
1406       /*
1407         Search for the nearest nonzero. Normally setting the index to zero may cause extra communication.
1408         But if the entire slice are empty, it is fine to use 0 since the index will not be loaded.
1409       */
1410       lastcol = 0;
1411       if (nrow>0) { /* nonempty row */
1412         lastcol = cp[8*(nrow-1)+row_in_slice]; /* use the index from the last nonzero at current row */
1413       } else if (!row_in_slice) { /* first row of the currect slice is empty */
1414         for (j=1;j<8;j++) {
1415           if (a->rlen[8*i+j]) {
1416             lastcol = cp[j];
1417             break;
1418           }
1419         }
1420       } else {
1421         if (a->sliidx[i+1] != shift) lastcol = cp[row_in_slice-1]; /* use the index from the previous row */
1422       }
1423 
1424       for (k=nrow; k<(a->sliidx[i+1]-shift)/8; ++k) {
1425         cp[8*k+row_in_slice] = lastcol;
1426         vp[8*k+row_in_slice] = (MatScalar)0;
1427       }
1428     }
1429   }
1430 
1431   A->info.mallocs += a->reallocs;
1432   a->reallocs      = 0;
1433 
1434   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1435   PetscFunctionReturn(0);
1436 }
1437 
1438 PetscErrorCode MatGetInfo_SeqSELL(Mat A,MatInfoType flag,MatInfo *info)
1439 {
1440   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1441 
1442   PetscFunctionBegin;
1443   info->block_size   = 1.0;
1444   info->nz_allocated = (double)a->maxallocmat;
1445   info->nz_used      = (double)a->sliidx[a->totalslices]; /* include padding zeros */
1446   info->nz_unneeded  = (double)(a->maxallocmat-a->sliidx[a->totalslices]);
1447   info->assemblies   = (double)A->num_ass;
1448   info->mallocs      = (double)A->info.mallocs;
1449   info->memory       = ((PetscObject)A)->mem;
1450   if (A->factortype) {
1451     info->fill_ratio_given  = A->info.fill_ratio_given;
1452     info->fill_ratio_needed = A->info.fill_ratio_needed;
1453     info->factor_mallocs    = A->info.factor_mallocs;
1454   } else {
1455     info->fill_ratio_given  = 0;
1456     info->fill_ratio_needed = 0;
1457     info->factor_mallocs    = 0;
1458   }
1459   PetscFunctionReturn(0);
1460 }
1461 
1462 PetscErrorCode MatSetValues_SeqSELL(Mat A,PetscInt m,const PetscInt im[],PetscInt n,const PetscInt in[],const PetscScalar v[],InsertMode is)
1463 {
1464   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1465   PetscInt       shift,i,k,l,low,high,t,ii,row,col,nrow;
1466   PetscInt       *cp,nonew=a->nonew,lastcol=-1;
1467   MatScalar      *vp,value;
1468   PetscErrorCode ierr;
1469 
1470   PetscFunctionBegin;
1471   for (k=0; k<m; k++) { /* loop over added rows */
1472     row = im[k];
1473     if (row < 0) continue;
1474 #if defined(PETSC_USE_DEBUG)
1475     if (row >= A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Row too large: row %D max %D",row,A->rmap->n-1);
1476 #endif
1477     shift = a->sliidx[row>>3]+(row&0x07); /* starting index of the row */
1478     cp    = a->colidx+shift; /* pointer to the row */
1479     vp    = a->val+shift; /* pointer to the row */
1480     nrow  = a->rlen[row];
1481     low   = 0;
1482     high  = nrow;
1483 
1484     for (l=0; l<n; l++) { /* loop over added columns */
1485       col = in[l];
1486       if (col<0) continue;
1487 #if defined(PETSC_USE_DEBUG)
1488       if (col >= A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Col too large: row %D max %D",col,A->cmap->n-1);
1489 #endif
1490       if (a->roworiented) {
1491         value = v[l+k*n];
1492       } else {
1493         value = v[k+l*m];
1494       }
1495       if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue;
1496 
1497       /* search in this row for the specified colmun, i indicates the column to be set */
1498       if (col <= lastcol) low = 0;
1499       else high = nrow;
1500       lastcol = col;
1501       while (high-low > 5) {
1502         t = (low+high)/2;
1503         if (*(cp+t*8) > col) high = t;
1504         else low = t;
1505       }
1506       for (i=low; i<high; i++) {
1507         if (*(cp+i*8) > col) break;
1508         if (*(cp+i*8) == col) {
1509           if (is == ADD_VALUES) *(vp+i*8) += value;
1510           else *(vp+i*8) = value;
1511           low = i + 1;
1512           goto noinsert;
1513         }
1514       }
1515       if (value == 0.0 && a->ignorezeroentries) goto noinsert;
1516       if (nonew == 1) goto noinsert;
1517       if (nonew == -1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Inserting a new nonzero (%D, %D) in the matrix", row, col);
1518       /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */
1519       MatSeqXSELLReallocateSELL(A,A->rmap->n,1,nrow,a->sliidx,row/8,row,col,a->colidx,a->val,cp,vp,nonew,MatScalar);
1520       /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */
1521       for (ii=nrow-1; ii>=i; ii--) {
1522         *(cp+(ii+1)*8) = *(cp+ii*8);
1523         *(vp+(ii+1)*8) = *(vp+ii*8);
1524       }
1525       a->rlen[row]++;
1526       *(cp+i*8) = col;
1527       *(vp+i*8) = value;
1528       a->nz++;
1529       A->nonzerostate++;
1530       low = i+1; high++; nrow++;
1531 noinsert:;
1532     }
1533     a->rlen[row] = nrow;
1534   }
1535   PetscFunctionReturn(0);
1536 }
1537 
1538 PetscErrorCode MatCopy_SeqSELL(Mat A,Mat B,MatStructure str)
1539 {
1540   PetscErrorCode ierr;
1541 
1542   PetscFunctionBegin;
1543   /* If the two matrices have the same copy implementation, use fast copy. */
1544   if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) {
1545     Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1546     Mat_SeqSELL *b=(Mat_SeqSELL*)B->data;
1547 
1548     if (a->sliidx[a->totalslices] != b->sliidx[b->totalslices]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Number of nonzeros in two matrices are different");
1549     ierr = PetscMemcpy(b->val,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1550   } else {
1551     ierr = MatCopy_Basic(A,B,str);CHKERRQ(ierr);
1552   }
1553   PetscFunctionReturn(0);
1554 }
1555 
1556 PetscErrorCode MatSetUp_SeqSELL(Mat A)
1557 {
1558   PetscErrorCode ierr;
1559 
1560   PetscFunctionBegin;
1561   ierr = MatSeqSELLSetPreallocation(A,PETSC_DEFAULT,0);CHKERRQ(ierr);
1562   PetscFunctionReturn(0);
1563 }
1564 
1565 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A,PetscScalar *array[])
1566 {
1567   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1568 
1569   PetscFunctionBegin;
1570   *array = a->val;
1571   PetscFunctionReturn(0);
1572 }
1573 
1574 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A,PetscScalar *array[])
1575 {
1576   PetscFunctionBegin;
1577   PetscFunctionReturn(0);
1578 }
1579 
1580 PetscErrorCode MatRealPart_SeqSELL(Mat A)
1581 {
1582   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
1583   PetscInt    i;
1584   MatScalar   *aval=a->val;
1585 
1586   PetscFunctionBegin;
1587   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i]=PetscRealPart(aval[i]);
1588   PetscFunctionReturn(0);
1589 }
1590 
1591 PetscErrorCode MatImaginaryPart_SeqSELL(Mat A)
1592 {
1593   Mat_SeqSELL    *a=(Mat_SeqSELL*)A->data;
1594   PetscInt       i;
1595   MatScalar      *aval=a->val;
1596   PetscErrorCode ierr;
1597 
1598   PetscFunctionBegin;
1599   for (i=0; i<a->sliidx[a->totalslices]; i++) aval[i] = PetscImaginaryPart(aval[i]);
1600   ierr = MatSeqSELLInvalidateDiagonal(A);CHKERRQ(ierr);
1601   PetscFunctionReturn(0);
1602 }
1603 
1604 PetscErrorCode MatScale_SeqSELL(Mat inA,PetscScalar alpha)
1605 {
1606   Mat_SeqSELL    *a=(Mat_SeqSELL*)inA->data;
1607   MatScalar      *aval=a->val;
1608   PetscScalar    oalpha=alpha;
1609   PetscBLASInt   one=1,size;
1610   PetscErrorCode ierr;
1611 
1612   PetscFunctionBegin;
1613   ierr = PetscBLASIntCast(a->sliidx[a->totalslices],&size);CHKERRQ(ierr);
1614   PetscStackCallBLAS("BLASscal",BLASscal_(&size,&oalpha,aval,&one));
1615   ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1616   ierr = MatSeqSELLInvalidateDiagonal(inA);CHKERRQ(ierr);
1617   PetscFunctionReturn(0);
1618 }
1619 
1620 PetscErrorCode MatShift_SeqSELL(Mat Y,PetscScalar a)
1621 {
1622   Mat_SeqSELL    *y=(Mat_SeqSELL*)Y->data;
1623   PetscErrorCode ierr;
1624 
1625   PetscFunctionBegin;
1626   if (!Y->preallocated || !y->nz) {
1627     ierr = MatSeqSELLSetPreallocation(Y,1,NULL);CHKERRQ(ierr);
1628   }
1629   ierr = MatShift_Basic(Y,a);CHKERRQ(ierr);
1630   PetscFunctionReturn(0);
1631 }
1632 
1633 PetscErrorCode MatSOR_SeqSELL(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
1634 {
1635   Mat_SeqSELL       *a=(Mat_SeqSELL*)A->data;
1636   PetscScalar       *x,sum,*t;
1637   const MatScalar   *idiag=0,*mdiag;
1638   const PetscScalar *b,*xb;
1639   PetscInt          n,m=A->rmap->n,i,j,shift;
1640   const PetscInt    *diag;
1641   PetscErrorCode    ierr;
1642 
1643   PetscFunctionBegin;
1644   its = its*lits;
1645 
1646   if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */
1647   if (!a->idiagvalid) {ierr = MatInvertDiagonal_SeqSELL(A,omega,fshift);CHKERRQ(ierr);}
1648   a->fshift = fshift;
1649   a->omega  = omega;
1650 
1651   diag  = a->diag;
1652   t     = a->ssor_work;
1653   idiag = a->idiag;
1654   mdiag = a->mdiag;
1655 
1656   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1657   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1658   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
1659   if (flag == SOR_APPLY_UPPER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_UPPER is not implemented");
1660   if (flag == SOR_APPLY_LOWER) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SOR_APPLY_LOWER is not implemented");
1661   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
1662 
1663   if (flag & SOR_ZERO_INITIAL_GUESS) {
1664     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1665       for (i=0; i<m; i++) {
1666         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1667         sum   = b[i];
1668         n     = (diag[i]-shift)/8;
1669         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1670         t[i]  = sum;
1671         x[i]  = sum*idiag[i];
1672       }
1673       xb   = t;
1674       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
1675     } else xb = b;
1676     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1677       for (i=m-1; i>=0; i--) {
1678         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1679         sum   = xb[i];
1680         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1681         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1682         if (xb == b) {
1683           x[i] = sum*idiag[i];
1684         } else {
1685           x[i] = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1686         }
1687       }
1688       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1689     }
1690     its--;
1691   }
1692   while (its--) {
1693     if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) {
1694       for (i=0; i<m; i++) {
1695         /* lower */
1696         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1697         sum   = b[i];
1698         n     = (diag[i]-shift)/8;
1699         for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1700         t[i]  = sum;             /* save application of the lower-triangular part */
1701         /* upper */
1702         n     = a->rlen[i]-(diag[i]-shift)/8-1;
1703         for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1704         x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1705       }
1706       xb   = t;
1707       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1708     } else xb = b;
1709     if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) {
1710       for (i=m-1; i>=0; i--) {
1711         shift = a->sliidx[i>>3]+(i&0x07); /* starting index of the row i */
1712         sum = xb[i];
1713         if (xb == b) {
1714           /* whole matrix (no checkpointing available) */
1715           n     = a->rlen[i];
1716           for (j=0; j<n; j++) sum -= a->val[shift+j*8]*x[a->colidx[shift+j*8]];
1717           x[i] = (1.-omega)*x[i]+(sum+mdiag[i]*x[i])*idiag[i];
1718         } else { /* lower-triangular part has been saved, so only apply upper-triangular */
1719           n     = a->rlen[i]-(diag[i]-shift)/8-1;
1720           for (j=1; j<=n; j++) sum -= a->val[diag[i]+j*8]*x[a->colidx[diag[i]+j*8]];
1721           x[i]  = (1.-omega)*x[i]+sum*idiag[i];  /* omega in idiag */
1722         }
1723       }
1724       if (xb == b) {
1725         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1726       } else {
1727         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper */
1728       }
1729     }
1730   }
1731   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1732   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1733   PetscFunctionReturn(0);
1734 }
1735 
1736 /* -------------------------------------------------------------------*/
1737 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL,
1738                                        MatGetRow_SeqSELL,
1739                                        MatRestoreRow_SeqSELL,
1740                                        MatMult_SeqSELL,
1741                                /* 4*/  MatMultAdd_SeqSELL,
1742                                        MatMultTranspose_SeqSELL,
1743                                        MatMultTransposeAdd_SeqSELL,
1744                                        0,
1745                                        0,
1746                                        0,
1747                                /* 10*/ 0,
1748                                        0,
1749                                        0,
1750                                        MatSOR_SeqSELL,
1751                                        0,
1752                                /* 15*/ MatGetInfo_SeqSELL,
1753                                        MatEqual_SeqSELL,
1754                                        MatGetDiagonal_SeqSELL,
1755                                        MatDiagonalScale_SeqSELL,
1756                                        0,
1757                                /* 20*/ 0,
1758                                        MatAssemblyEnd_SeqSELL,
1759                                        MatSetOption_SeqSELL,
1760                                        MatZeroEntries_SeqSELL,
1761                                /* 24*/ 0,
1762                                        0,
1763                                        0,
1764                                        0,
1765                                        0,
1766                                /* 29*/ MatSetUp_SeqSELL,
1767                                        0,
1768                                        0,
1769                                        0,
1770                                        0,
1771                                /* 34*/ MatDuplicate_SeqSELL,
1772                                        0,
1773                                        0,
1774                                        0,
1775                                        0,
1776                                /* 39*/ 0,
1777                                        0,
1778                                        0,
1779                                        MatGetValues_SeqSELL,
1780                                        MatCopy_SeqSELL,
1781                                /* 44*/ 0,
1782                                        MatScale_SeqSELL,
1783                                        MatShift_SeqSELL,
1784                                        0,
1785                                        0,
1786                                /* 49*/ 0,
1787                                        0,
1788                                        0,
1789                                        0,
1790                                        0,
1791                                /* 54*/ MatFDColoringCreate_SeqXAIJ,
1792                                        0,
1793                                        0,
1794                                        0,
1795                                        0,
1796                                /* 59*/ 0,
1797                                        MatDestroy_SeqSELL,
1798                                        MatView_SeqSELL,
1799                                        0,
1800                                        0,
1801                                /* 64*/ 0,
1802                                        0,
1803                                        0,
1804                                        0,
1805                                        0,
1806                                /* 69*/ 0,
1807                                        0,
1808                                        0,
1809                                        0,
1810                                        0,
1811                                /* 74*/ 0,
1812                                        MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */
1813                                        0,
1814                                        0,
1815                                        0,
1816                                /* 79*/ 0,
1817                                        0,
1818                                        0,
1819                                        0,
1820                                        0,
1821                                /* 84*/ 0,
1822                                        0,
1823                                        0,
1824                                        0,
1825                                        0,
1826                                /* 89*/ 0,
1827                                        0,
1828                                        0,
1829                                        0,
1830                                        0,
1831                                /* 94*/ 0,
1832                                        0,
1833                                        0,
1834                                        0,
1835                                        0,
1836                                /* 99*/ 0,
1837                                        0,
1838                                        0,
1839                                        MatConjugate_SeqSELL,
1840                                        0,
1841                                /*104*/ 0,
1842                                        0,
1843                                        0,
1844                                        0,
1845                                        0,
1846                                /*109*/ 0,
1847                                        0,
1848                                        0,
1849                                        0,
1850                                        MatMissingDiagonal_SeqSELL,
1851                                /*114*/ 0,
1852                                        0,
1853                                        0,
1854                                        0,
1855                                        0,
1856                                /*119*/ 0,
1857                                        0,
1858                                        0,
1859                                        0,
1860                                        0,
1861                                /*124*/ 0,
1862                                        0,
1863                                        0,
1864                                        0,
1865                                        0,
1866                                /*129*/ 0,
1867                                        0,
1868                                        0,
1869                                        0,
1870                                        0,
1871                                /*134*/ 0,
1872                                        0,
1873                                        0,
1874                                        0,
1875                                        0,
1876                                /*139*/ 0,
1877                                        0,
1878                                        0,
1879                                        MatFDColoringSetUp_SeqXAIJ,
1880                                        0,
1881                                 /*144*/0
1882 };
1883 
1884 PetscErrorCode MatStoreValues_SeqSELL(Mat mat)
1885 {
1886   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1887   PetscErrorCode ierr;
1888 
1889   PetscFunctionBegin;
1890   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1891 
1892   /* allocate space for values if not already there */
1893   if (!a->saved_values) {
1894     ierr = PetscMalloc1(a->sliidx[a->totalslices]+1,&a->saved_values);CHKERRQ(ierr);
1895     ierr = PetscLogObjectMemory((PetscObject)mat,(a->sliidx[a->totalslices]+1)*sizeof(PetscScalar));CHKERRQ(ierr);
1896   }
1897 
1898   /* copy values over */
1899   ierr = PetscMemcpy(a->saved_values,a->val,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1900   PetscFunctionReturn(0);
1901 }
1902 
1903 PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat)
1904 {
1905   Mat_SeqSELL    *a=(Mat_SeqSELL*)mat->data;
1906   PetscErrorCode ierr;
1907 
1908   PetscFunctionBegin;
1909   if (!a->nonew) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first");
1910   if (!a->saved_values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ORDER,"Must call MatStoreValues(A);first");
1911   /* copy values over */
1912   ierr = PetscMemcpy(a->val,a->saved_values,a->sliidx[a->totalslices]*sizeof(PetscScalar));CHKERRQ(ierr);
1913   PetscFunctionReturn(0);
1914 }
1915 
1916 /*@C
1917  MatSeqSELLRestoreArray - returns access to the array where the data for a MATSEQSELL matrix is stored obtained by MatSeqSELLGetArray()
1918 
1919  Not Collective
1920 
1921  Input Parameters:
1922  .  mat - a MATSEQSELL matrix
1923  .  array - pointer to the data
1924 
1925  Level: intermediate
1926 
1927  .seealso: MatSeqSELLGetArray(), MatSeqSELLRestoreArrayF90()
1928  @*/
1929 PetscErrorCode MatSeqSELLRestoreArray(Mat A,PetscScalar **array)
1930 {
1931   PetscErrorCode ierr;
1932 
1933   PetscFunctionBegin;
1934   ierr = PetscUseMethod(A,"MatSeqSELLRestoreArray_C",(Mat,PetscScalar**),(A,array));CHKERRQ(ierr);
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B)
1939 {
1940   Mat_SeqSELL    *b;
1941   PetscMPIInt    size;
1942   PetscErrorCode ierr;
1943 
1944   PetscFunctionBegin;
1945   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
1946   if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Comm must be of size 1");
1947 
1948   ierr = PetscNewLog(B,&b);CHKERRQ(ierr);
1949 
1950   B->data = (void*)b;
1951 
1952   ierr = PetscMemcpy(B->ops,&MatOps_Values,sizeof(struct _MatOps));CHKERRQ(ierr);
1953 
1954   b->row                = 0;
1955   b->col                = 0;
1956   b->icol               = 0;
1957   b->reallocs           = 0;
1958   b->ignorezeroentries  = PETSC_FALSE;
1959   b->roworiented        = PETSC_TRUE;
1960   b->nonew              = 0;
1961   b->diag               = 0;
1962   b->solve_work         = 0;
1963   B->spptr              = 0;
1964   b->saved_values       = 0;
1965   b->idiag              = 0;
1966   b->mdiag              = 0;
1967   b->ssor_work          = 0;
1968   b->omega              = 1.0;
1969   b->fshift             = 0.0;
1970   b->idiagvalid         = PETSC_FALSE;
1971   b->ibdiagvalid        = PETSC_FALSE;
1972   b->keepnonzeropattern = PETSC_FALSE;
1973 
1974   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQSELL);CHKERRQ(ierr);
1975   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLGetArray_C",MatSeqSELLGetArray_SeqSELL);CHKERRQ(ierr);
1976   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLRestoreArray_C",MatSeqSELLRestoreArray_SeqSELL);CHKERRQ(ierr);
1977   ierr = PetscObjectComposeFunction((PetscObject)B,"MatStoreValues_C",MatStoreValues_SeqSELL);CHKERRQ(ierr);
1978   ierr = PetscObjectComposeFunction((PetscObject)B,"MatRetrieveValues_C",MatRetrieveValues_SeqSELL);CHKERRQ(ierr);
1979   ierr = PetscObjectComposeFunction((PetscObject)B,"MatSeqSELLSetPreallocation_C",MatSeqSELLSetPreallocation_SeqSELL);CHKERRQ(ierr);
1980   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqsell_seqaij_C",MatConvert_SeqSELL_SeqAIJ);CHKERRQ(ierr);
1981   PetscFunctionReturn(0);
1982 }
1983 
1984 /*
1985  Given a matrix generated with MatGetFactor() duplicates all the information in A into B
1986  */
1987 PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C,Mat A,MatDuplicateOption cpvalues,PetscBool mallocmatspace)
1988 {
1989   Mat_SeqSELL    *c,*a=(Mat_SeqSELL*)A->data;
1990   PetscInt       i,m=A->rmap->n;
1991   PetscInt       totalslices=a->totalslices;
1992   PetscErrorCode ierr;
1993 
1994   PetscFunctionBegin;
1995   c = (Mat_SeqSELL*)C->data;
1996 
1997   C->factortype = A->factortype;
1998   c->row        = 0;
1999   c->col        = 0;
2000   c->icol       = 0;
2001   c->reallocs   = 0;
2002 
2003   C->assembled = PETSC_TRUE;
2004 
2005   ierr = PetscLayoutReference(A->rmap,&C->rmap);CHKERRQ(ierr);
2006   ierr = PetscLayoutReference(A->cmap,&C->cmap);CHKERRQ(ierr);
2007 
2008   ierr = PetscMalloc1(8*totalslices,&c->rlen);CHKERRQ(ierr);
2009   ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2010   ierr = PetscMalloc1(totalslices+1,&c->sliidx);CHKERRQ(ierr);
2011   ierr = PetscLogObjectMemory((PetscObject)C, (totalslices+1)*sizeof(PetscInt));CHKERRQ(ierr);
2012 
2013   for (i=0; i<m; i++) c->rlen[i] = a->rlen[i];
2014   for (i=0; i<totalslices+1; i++) c->sliidx[i] = a->sliidx[i];
2015 
2016   /* allocate the matrix space */
2017   if (mallocmatspace) {
2018     ierr = PetscMalloc2(a->maxallocmat,&c->val,a->maxallocmat,&c->colidx);CHKERRQ(ierr);
2019     ierr = PetscLogObjectMemory((PetscObject)C,a->maxallocmat*(sizeof(PetscScalar)+sizeof(PetscInt)));CHKERRQ(ierr);
2020 
2021     c->singlemalloc = PETSC_TRUE;
2022 
2023     if (m > 0) {
2024       ierr = PetscMemcpy(c->colidx,a->colidx,(a->maxallocmat)*sizeof(PetscInt));CHKERRQ(ierr);
2025       if (cpvalues == MAT_COPY_VALUES) {
2026         ierr = PetscMemcpy(c->val,a->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2027       } else {
2028         ierr = PetscMemzero(c->val,a->maxallocmat*sizeof(PetscScalar));CHKERRQ(ierr);
2029       }
2030     }
2031   }
2032 
2033   c->ignorezeroentries = a->ignorezeroentries;
2034   c->roworiented       = a->roworiented;
2035   c->nonew             = a->nonew;
2036   if (a->diag) {
2037     ierr = PetscMalloc1(m,&c->diag);CHKERRQ(ierr);
2038     ierr = PetscLogObjectMemory((PetscObject)C,m*sizeof(PetscInt));CHKERRQ(ierr);
2039     for (i=0; i<m; i++) {
2040       c->diag[i] = a->diag[i];
2041     }
2042   } else c->diag = 0;
2043 
2044   c->solve_work         = 0;
2045   c->saved_values       = 0;
2046   c->idiag              = 0;
2047   c->ssor_work          = 0;
2048   c->keepnonzeropattern = a->keepnonzeropattern;
2049   c->free_val           = PETSC_TRUE;
2050   c->free_colidx        = PETSC_TRUE;
2051 
2052   c->maxallocmat  = a->maxallocmat;
2053   c->maxallocrow  = a->maxallocrow;
2054   c->rlenmax      = a->rlenmax;
2055   c->nz           = a->nz;
2056   C->preallocated = PETSC_TRUE;
2057 
2058   c->nonzerorowcnt = a->nonzerorowcnt;
2059   C->nonzerostate  = A->nonzerostate;
2060 
2061   ierr = PetscFunctionListDuplicate(((PetscObject)A)->qlist,&((PetscObject)C)->qlist);CHKERRQ(ierr);
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 PetscErrorCode MatDuplicate_SeqSELL(Mat A,MatDuplicateOption cpvalues,Mat *B)
2066 {
2067   PetscErrorCode ierr;
2068 
2069   PetscFunctionBegin;
2070   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
2071   ierr = MatSetSizes(*B,A->rmap->n,A->cmap->n,A->rmap->n,A->cmap->n);CHKERRQ(ierr);
2072   if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) {
2073     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
2074   }
2075   ierr = MatSetType(*B,((PetscObject)A)->type_name);CHKERRQ(ierr);
2076   ierr = MatDuplicateNoCreate_SeqSELL(*B,A,cpvalues,PETSC_TRUE);CHKERRQ(ierr);
2077   PetscFunctionReturn(0);
2078 }
2079 
2080 /*@C
2081  MatCreateSeqSELL - Creates a sparse matrix in SELL format.
2082 
2083  Collective on MPI_Comm
2084 
2085  Input Parameters:
2086  +  comm - MPI communicator, set to PETSC_COMM_SELF
2087  .  m - number of rows
2088  .  n - number of columns
2089  .  rlenmax - maximum number of nonzeros in a row
2090  -  rlen - array containing the number of nonzeros in the various rows
2091  (possibly different for each row) or NULL
2092 
2093  Output Parameter:
2094  .  A - the matrix
2095 
2096  It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2097  MatXXXXSetPreallocation() paradgm instead of this routine directly.
2098  [MatXXXXSetPreallocation() is, for example, MatSeqSELLSetPreallocation]
2099 
2100  Notes:
2101  If nnz is given then nz is ignored
2102 
2103  Specify the preallocated storage with either rlenmax or rlen (not both).
2104  Set rlenmax=PETSC_DEFAULT and rlen=NULL for PETSc to control dynamic memory
2105  allocation.  For large problems you MUST preallocate memory or you
2106  will get TERRIBLE performance, see the users' manual chapter on matrices.
2107 
2108  Level: intermediate
2109 
2110  .seealso: MatCreate(), MatCreateSELL(), MatSetValues(), MatCreateSeqSELLWithArrays()
2111 
2112  @*/
2113 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt maxallocrow,const PetscInt rlen[],Mat *A)
2114 {
2115   PetscErrorCode ierr;
2116 
2117   PetscFunctionBegin;
2118   ierr = MatCreate(comm,A);CHKERRQ(ierr);
2119   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
2120   ierr = MatSetType(*A,MATSEQSELL);CHKERRQ(ierr);
2121   ierr = MatSeqSELLSetPreallocation_SeqSELL(*A,maxallocrow,rlen);CHKERRQ(ierr);
2122   PetscFunctionReturn(0);
2123 }
2124 
2125 PetscErrorCode MatEqual_SeqSELL(Mat A,Mat B,PetscBool * flg)
2126 {
2127   Mat_SeqSELL     *a=(Mat_SeqSELL*)A->data,*b=(Mat_SeqSELL*)B->data;
2128   PetscInt       totalslices=a->totalslices;
2129   PetscErrorCode ierr;
2130 
2131   PetscFunctionBegin;
2132   /* If the  matrix dimensions are not equal,or no of nonzeros */
2133   if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) ||(a->nz != b->nz) || (a->rlenmax != b->rlenmax)) {
2134     *flg = PETSC_FALSE;
2135     PetscFunctionReturn(0);
2136   }
2137   /* if the a->colidx are the same */
2138   ierr = PetscMemcmp(a->colidx,b->colidx,a->sliidx[totalslices]*sizeof(PetscInt),flg);CHKERRQ(ierr);
2139   if (!*flg) PetscFunctionReturn(0);
2140   /* if a->val are the same */
2141   ierr = PetscMemcmp(a->val,b->val,a->sliidx[totalslices]*sizeof(PetscScalar),flg);CHKERRQ(ierr);
2142   PetscFunctionReturn(0);
2143 }
2144 
2145 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A)
2146 {
2147   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2148 
2149   PetscFunctionBegin;
2150   a->idiagvalid  = PETSC_FALSE;
2151   a->ibdiagvalid = PETSC_FALSE;
2152   PetscFunctionReturn(0);
2153 }
2154 
2155 PetscErrorCode MatConjugate_SeqSELL(Mat A)
2156 {
2157 #if defined(PETSC_USE_COMPLEX)
2158   Mat_SeqSELL *a=(Mat_SeqSELL*)A->data;
2159   PetscInt    i;
2160   PetscScalar *val = a->val;
2161 
2162   PetscFunctionBegin;
2163   for (i=0; i<a->sliidx[a->totalslices]; i++) {
2164     val[i] = PetscConj(val[i]);
2165   }
2166 #else
2167   PetscFunctionBegin;
2168 #endif
2169   PetscFunctionReturn(0);
2170 }
2171