xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision ebead697dbf761eb322f829370bbe90b3bd93fa3)
1 
2 /*
3   Defines basic operations for the MATSEQAIJPERM matrix class.
4   This class is derived from the MATSEQAIJ class and retains the
5   compressed row storage (aka Yale sparse matrix format) but augments
6   it with some permutation information that enables some operations
7   to be more vectorizable.  A physically rearranged copy of the matrix
8   may be stored if the user desires.
9 
10   Eventually a variety of permutations may be supported.
11 */
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16 #include <immintrin.h>
17 
18 #if !defined(_MM_SCALE_8)
19 #define _MM_SCALE_8    8
20 #endif
21 #if !defined(_MM_SCALE_4)
22 #define _MM_SCALE_4    4
23 #endif
24 #endif
25 
26 #define NDIM 512
27 /* NDIM specifies how many rows at a time we should work with when
28  * performing the vectorized mat-vec.  This depends on various factors
29  * such as vector register length, etc., and I really need to add a
30  * way for the user (or the library) to tune this.  I'm setting it to
31  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32  * routines. */
33 
34 typedef struct {
35   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36 
37   PetscInt         ngroup;
38   PetscInt         *xgroup;
39   /* Denotes where groups of rows with same number of nonzeros
40    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41    * where the ith group begins. */
42 
43   PetscInt         *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
44   PetscInt         *iperm;  /* The permutation vector. */
45 
46   /* Some of this stuff is for Ed's recursive triangular solve.
47    * I'm not sure what I need yet. */
48   PetscInt         blocksize;
49   PetscInt         nstep;
50   PetscInt         *jstart_list;
51   PetscInt         *jend_list;
52   PetscInt         *action_list;
53   PetscInt         *ngroup_list;
54   PetscInt         **ipointer_list;
55   PetscInt         **xgroup_list;
56   PetscInt         **nzgroup_list;
57   PetscInt         **iperm_list;
58 } Mat_SeqAIJPERM;
59 
60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61 {
62   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63   /* so we will ignore 'MatType type'. */
64   Mat            B       = *newmat;
65   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
66 
67   PetscFunctionBegin;
68   if (reuse == MAT_INITIAL_MATRIX) {
69     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,&B));
70     aijperm=(Mat_SeqAIJPERM*)B->spptr;
71   }
72 
73   /* Reset the original function pointers. */
74   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
75   B->ops->destroy     = MatDestroy_SeqAIJ;
76   B->ops->duplicate   = MatDuplicate_SeqAIJ;
77   B->ops->mult        = MatMult_SeqAIJ;
78   B->ops->multadd     = MatMultAdd_SeqAIJ;
79 
80   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL));
81 
82   /* Free everything in the Mat_SeqAIJPERM data structure.*/
83   PetscCall(PetscFree(aijperm->xgroup));
84   PetscCall(PetscFree(aijperm->nzgroup));
85   PetscCall(PetscFree(aijperm->iperm));
86   PetscCall(PetscFree(B->spptr));
87 
88   /* Change the type of B to MATSEQAIJ. */
89   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));
90 
91   *newmat = B;
92   PetscFunctionReturn(0);
93 }
94 
95 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
96 {
97   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
98 
99   PetscFunctionBegin;
100   if (aijperm) {
101     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
102     PetscCall(PetscFree(aijperm->xgroup));
103     PetscCall(PetscFree(aijperm->nzgroup));
104     PetscCall(PetscFree(aijperm->iperm));
105     PetscCall(PetscFree(A->spptr));
106   }
107   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
108    * to destroy everything that remains. */
109   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
110   /* Note that I don't call MatSetType().  I believe this is because that
111    * is only to be called when *building* a matrix.  I could be wrong, but
112    * that is how things work for the SuperLU matrix class. */
113   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijperm_seqaij_C",NULL));
114   PetscCall(MatDestroy_SeqAIJ(A));
115   PetscFunctionReturn(0);
116 }
117 
118 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
119 {
120   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
121   Mat_SeqAIJPERM *aijperm_dest;
122   PetscBool      perm;
123 
124   PetscFunctionBegin;
125   PetscCall(MatDuplicate_SeqAIJ(A,op,M));
126   PetscCall(PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm));
127   if (perm) {
128     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
129     PetscCall(PetscFree(aijperm_dest->xgroup));
130     PetscCall(PetscFree(aijperm_dest->nzgroup));
131     PetscCall(PetscFree(aijperm_dest->iperm));
132   } else {
133     PetscCall(PetscNewLog(*M,&aijperm_dest));
134     (*M)->spptr = (void*) aijperm_dest;
135     PetscCall(PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM));
136     PetscCall(PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ));
137   }
138   PetscCall(PetscArraycpy(aijperm_dest,aijperm,1));
139   /* Allocate space for, and copy the grouping and permutation info.
140    * I note that when the groups are initially determined in
141    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
142    * necessary.  But at this point, we know how large they need to be, and
143    * allocate only the necessary amount of memory.  So the duplicated matrix
144    * may actually use slightly less storage than the original! */
145   PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
146   PetscCall(PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup));
147   PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
148   PetscCall(PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n));
149   PetscCall(PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1));
150   PetscCall(PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup));
151   PetscFunctionReturn(0);
152 }
153 
154 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
155 {
156   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
157   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
158   PetscInt       m;       /* Number of rows in the matrix. */
159   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
160   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
161   PetscInt       *rows_in_bucket;
162   /* To construct the permutation, we sort each row into one of maxnz
163    * buckets based on how many nonzeros are in the row. */
164   PetscInt       nz;
165   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
166   PetscInt       *ipnz;
167   /* When constructing the iperm permutation vector,
168    * ipnz[nz] is used to point to the next place in the permutation vector
169    * that a row with nz nonzero elements should be placed.*/
170   PetscInt       i, ngroup, istart, ipos;
171 
172   PetscFunctionBegin;
173   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
174   aijperm->nonzerostate = A->nonzerostate;
175  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
176   PetscCall(PetscFree(aijperm->xgroup));
177   PetscCall(PetscFree(aijperm->nzgroup));
178   PetscCall(PetscFree(aijperm->iperm));
179 
180   m  = A->rmap->n;
181   ia = a->i;
182 
183   /* Allocate the arrays that will hold the permutation vector. */
184   PetscCall(PetscMalloc1(m, &aijperm->iperm));
185 
186   /* Allocate some temporary work arrays that will be used in
187    * calculating the permutation vector and groupings. */
188   PetscCall(PetscMalloc1(m, &nz_in_row));
189 
190   /* Now actually figure out the permutation and grouping. */
191 
192   /* First pass: Determine number of nonzeros in each row, maximum
193    * number of nonzeros in any row, and how many rows fall into each
194    * "bucket" of rows with same number of nonzeros. */
195   maxnz = 0;
196   for (i=0; i<m; i++) {
197     nz_in_row[i] = ia[i+1]-ia[i];
198     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
199   }
200   PetscCall(PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket));
201   PetscCall(PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz));
202 
203   for (i=0; i<=maxnz; i++) {
204     rows_in_bucket[i] = 0;
205   }
206   for (i=0; i<m; i++) {
207     nz = nz_in_row[i];
208     rows_in_bucket[nz]++;
209   }
210 
211   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
212    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
213    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
214    * then xgroup[] must consist of (maxnz + 2) elements, since the last
215    * element of xgroup will tell us where the (maxnz + 1)th group ends.
216    * We allocate space for the maximum number of groups;
217    * that is potentially a little wasteful, but not too much so.
218    * Perhaps I should fix it later. */
219   PetscCall(PetscMalloc1(maxnz+2, &aijperm->xgroup));
220   PetscCall(PetscMalloc1(maxnz+1, &aijperm->nzgroup));
221 
222   /* Second pass.  Look at what is in the buckets and create the groupings.
223    * Note that it is OK to have a group of rows with no non-zero values. */
224   ngroup = 0;
225   istart = 0;
226   for (i=0; i<=maxnz; i++) {
227     if (rows_in_bucket[i] > 0) {
228       aijperm->nzgroup[ngroup] = i;
229       aijperm->xgroup[ngroup]  = istart;
230       ngroup++;
231       istart += rows_in_bucket[i];
232     }
233   }
234 
235   aijperm->xgroup[ngroup] = istart;
236   aijperm->ngroup         = ngroup;
237 
238   /* Now fill in the permutation vector iperm. */
239   ipnz[0] = 0;
240   for (i=0; i<maxnz; i++) {
241     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
242   }
243 
244   for (i=0; i<m; i++) {
245     nz                   = nz_in_row[i];
246     ipos                 = ipnz[nz];
247     aijperm->iperm[ipos] = i;
248     ipnz[nz]++;
249   }
250 
251   /* Clean up temporary work arrays. */
252   PetscCall(PetscFree(rows_in_bucket));
253   PetscCall(PetscFree(ipnz));
254   PetscCall(PetscFree(nz_in_row));
255   PetscFunctionReturn(0);
256 }
257 
258 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
259 {
260   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
261 
262   PetscFunctionBegin;
263   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
264 
265   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
266    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
267    * I'm not sure if this is the best way to do this, but it avoids
268    * a lot of code duplication.
269    * I also note that currently MATSEQAIJPERM doesn't know anything about
270    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
271    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
272    * this, this may break things.  (Don't know... haven't looked at it.) */
273   a->inode.use = PETSC_FALSE;
274   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
275 
276   /* Now calculate the permutation and grouping information. */
277   PetscCall(MatSeqAIJPERM_create_perm(A));
278   PetscFunctionReturn(0);
279 }
280 
281 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
282 {
283   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
284   const PetscScalar *x;
285   PetscScalar       *y;
286   const MatScalar   *aa;
287   const PetscInt    *aj,*ai;
288 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
289   PetscInt          i,j;
290 #endif
291 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
292   __m512d           vec_x,vec_y,vec_vals;
293   __m256i           vec_idx,vec_ipos,vec_j;
294   __mmask8           mask;
295 #endif
296 
297   /* Variables that don't appear in MatMult_SeqAIJ. */
298   Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
299   PetscInt          *iperm;  /* Points to the permutation vector. */
300   PetscInt          *xgroup;
301   /* Denotes where groups of rows with same number of nonzeros
302    * begin and end in iperm. */
303   PetscInt          *nzgroup;
304   PetscInt          ngroup;
305   PetscInt          igroup;
306   PetscInt          jstart,jend;
307   /* jstart is used in loops to denote the position in iperm where a
308    * group starts; jend denotes the position where it ends.
309    * (jend + 1 is where the next group starts.) */
310   PetscInt          iold,nz;
311   PetscInt          istart,iend,isize;
312   PetscInt          ipos;
313   PetscScalar       yp[NDIM];
314   PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
315 
316 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
317 #pragma disjoint(*x,*y,*aa)
318 #endif
319 
320   PetscFunctionBegin;
321   PetscCall(VecGetArrayRead(xx,&x));
322   PetscCall(VecGetArray(yy,&y));
323   aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
324   aa   = a->a; /* Nonzero elements stored row-by-row. */
325   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
326 
327   /* Get the info we need about the permutations and groupings. */
328   iperm   = aijperm->iperm;
329   ngroup  = aijperm->ngroup;
330   xgroup  = aijperm->xgroup;
331   nzgroup = aijperm->nzgroup;
332 
333 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
334   fortranmultaijperm_(&m,x,ii,aj,aa,y);
335 #else
336 
337   for (igroup=0; igroup<ngroup; igroup++) {
338     jstart = xgroup[igroup];
339     jend   = xgroup[igroup+1] - 1;
340     nz     = nzgroup[igroup];
341 
342     /* Handle the special cases where the number of nonzeros per row
343      * in the group is either 0 or 1. */
344     if (nz == 0) {
345       for (i=jstart; i<=jend; i++) {
346         y[iperm[i]] = 0.0;
347       }
348     } else if (nz == 1) {
349       for (i=jstart; i<=jend; i++) {
350         iold    = iperm[i];
351         ipos    = ai[iold];
352         y[iold] = aa[ipos] * x[aj[ipos]];
353       }
354     } else {
355 
356       /* We work our way through the current group in chunks of NDIM rows
357        * at a time. */
358 
359       for (istart=jstart; istart<=jend; istart+=NDIM) {
360         /* Figure out where the chunk of 'isize' rows ends in iperm.
361          * 'isize may of course be less than NDIM for the last chunk. */
362         iend = istart + (NDIM - 1);
363 
364         if (iend > jend) iend = jend;
365 
366         isize = iend - istart + 1;
367 
368         /* Initialize the yp[] array that will be used to hold part of
369          * the permuted results vector, and figure out where in aa each
370          * row of the chunk will begin. */
371         for (i=0; i<isize; i++) {
372           iold = iperm[istart + i];
373           /* iold is a row number from the matrix A *before* reordering. */
374           ip[i] = ai[iold];
375           /* ip[i] tells us where the ith row of the chunk begins in aa. */
376           yp[i] = (PetscScalar) 0.0;
377         }
378 
379         /* If the number of zeros per row exceeds the number of rows in
380          * the chunk, we should vectorize along nz, that is, perform the
381          * mat-vec one row at a time as in the usual CSR case. */
382         if (nz > isize) {
383 #if defined(PETSC_HAVE_CRAY_VECTOR)
384 #pragma _CRI preferstream
385 #endif
386           for (i=0; i<isize; i++) {
387 #if defined(PETSC_HAVE_CRAY_VECTOR)
388 #pragma _CRI prefervector
389 #endif
390 
391 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
392             vec_y = _mm512_setzero_pd();
393             ipos = ip[i];
394             for (j=0; j<(nz>>3); j++) {
395               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
396               vec_vals = _mm512_loadu_pd(&aa[ipos]);
397               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
398               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
399               ipos += 8;
400             }
401             if ((nz&0x07)>2) {
402               mask     = (__mmask8)(0xff >> (8-(nz&0x07)));
403               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
404               vec_vals = _mm512_loadu_pd(&aa[ipos]);
405               vec_x    = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
406               vec_y    = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
407             } else if ((nz&0x07)==2) {
408               yp[i] += aa[ipos]*x[aj[ipos]];
409               yp[i] += aa[ipos+1]*x[aj[ipos+1]];
410             } else if ((nz&0x07)==1) {
411               yp[i] += aa[ipos]*x[aj[ipos]];
412             }
413             yp[i] += _mm512_reduce_add_pd(vec_y);
414 #else
415             for (j=0; j<nz; j++) {
416               ipos   = ip[i] + j;
417               yp[i] += aa[ipos] * x[aj[ipos]];
418             }
419 #endif
420           }
421         } else {
422           /* Otherwise, there are enough rows in the chunk to make it
423            * worthwhile to vectorize across the rows, that is, to do the
424            * matvec by operating with "columns" of the chunk. */
425           for (j=0; j<nz; j++) {
426 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
427             vec_j = _mm256_set1_epi32(j);
428             for (i=0; i<((isize>>3)<<3); i+=8) {
429               vec_y    = _mm512_loadu_pd(&yp[i]);
430               vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
431               vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
432               vec_idx  = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
433               vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
434               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
435               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
436               _mm512_storeu_pd(&yp[i],vec_y);
437             }
438             for (i=isize-(isize&0x07); i<isize; i++) {
439               ipos = ip[i]+j;
440               yp[i] += aa[ipos]*x[aj[ipos]];
441             }
442 #else
443             for (i=0; i<isize; i++) {
444               ipos   = ip[i] + j;
445               yp[i] += aa[ipos] * x[aj[ipos]];
446             }
447 #endif
448           }
449         }
450 
451 #if defined(PETSC_HAVE_CRAY_VECTOR)
452 #pragma _CRI ivdep
453 #endif
454         /* Put results from yp[] into non-permuted result vector y. */
455         for (i=0; i<isize; i++) {
456           y[iperm[istart+i]] = yp[i];
457         }
458       } /* End processing chunk of isize rows of a group. */
459     } /* End handling matvec for chunk with nz > 1. */
460   } /* End loop over igroup. */
461 #endif
462   PetscCall(PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0)));
463   PetscCall(VecRestoreArrayRead(xx,&x));
464   PetscCall(VecRestoreArray(yy,&y));
465   PetscFunctionReturn(0);
466 }
467 
468 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
469  * Note that the names I used to designate the vectors differs from that
470  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
471  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
472 /*
473     I hate having virtually identical code for the mult and the multadd!!!
474 */
475 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
476 {
477   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
478   const PetscScalar *x;
479   PetscScalar       *y,*w;
480   const MatScalar   *aa;
481   const PetscInt    *aj,*ai;
482 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
483   PetscInt i,j;
484 #endif
485 
486   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
487   Mat_SeqAIJPERM * aijperm;
488   PetscInt       *iperm;    /* Points to the permutation vector. */
489   PetscInt       *xgroup;
490   /* Denotes where groups of rows with same number of nonzeros
491    * begin and end in iperm. */
492   PetscInt *nzgroup;
493   PetscInt ngroup;
494   PetscInt igroup;
495   PetscInt jstart,jend;
496   /* jstart is used in loops to denote the position in iperm where a
497    * group starts; jend denotes the position where it ends.
498    * (jend + 1 is where the next group starts.) */
499   PetscInt    iold,nz;
500   PetscInt    istart,iend,isize;
501   PetscInt    ipos;
502   PetscScalar yp[NDIM];
503   PetscInt    ip[NDIM];
504   /* yp[] and ip[] are treated as vector "registers" for performing
505    * the mat-vec. */
506 
507 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
508 #pragma disjoint(*x,*y,*aa)
509 #endif
510 
511   PetscFunctionBegin;
512   PetscCall(VecGetArrayRead(xx,&x));
513   PetscCall(VecGetArrayPair(yy,ww,&y,&w));
514 
515   aj = a->j;   /* aj[k] gives column index for element aa[k]. */
516   aa = a->a;   /* Nonzero elements stored row-by-row. */
517   ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */
518 
519   /* Get the info we need about the permutations and groupings. */
520   aijperm = (Mat_SeqAIJPERM*) A->spptr;
521   iperm   = aijperm->iperm;
522   ngroup  = aijperm->ngroup;
523   xgroup  = aijperm->xgroup;
524   nzgroup = aijperm->nzgroup;
525 
526 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
527   fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
528 #else
529 
530   for (igroup=0; igroup<ngroup; igroup++) {
531     jstart = xgroup[igroup];
532     jend   = xgroup[igroup+1] - 1;
533 
534     nz = nzgroup[igroup];
535 
536     /* Handle the special cases where the number of nonzeros per row
537      * in the group is either 0 or 1. */
538     if (nz == 0) {
539       for (i=jstart; i<=jend; i++) {
540         iold    = iperm[i];
541         y[iold] = w[iold];
542       }
543     }
544     else if (nz == 1) {
545       for (i=jstart; i<=jend; i++) {
546         iold    = iperm[i];
547         ipos    = ai[iold];
548         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
549       }
550     }
551     /* For the general case: */
552     else {
553 
554       /* We work our way through the current group in chunks of NDIM rows
555        * at a time. */
556 
557       for (istart=jstart; istart<=jend; istart+=NDIM) {
558         /* Figure out where the chunk of 'isize' rows ends in iperm.
559          * 'isize may of course be less than NDIM for the last chunk. */
560         iend = istart + (NDIM - 1);
561         if (iend > jend) iend = jend;
562         isize = iend - istart + 1;
563 
564         /* Initialize the yp[] array that will be used to hold part of
565          * the permuted results vector, and figure out where in aa each
566          * row of the chunk will begin. */
567         for (i=0; i<isize; i++) {
568           iold = iperm[istart + i];
569           /* iold is a row number from the matrix A *before* reordering. */
570           ip[i] = ai[iold];
571           /* ip[i] tells us where the ith row of the chunk begins in aa. */
572           yp[i] = w[iold];
573         }
574 
575         /* If the number of zeros per row exceeds the number of rows in
576          * the chunk, we should vectorize along nz, that is, perform the
577          * mat-vec one row at a time as in the usual CSR case. */
578         if (nz > isize) {
579 #if defined(PETSC_HAVE_CRAY_VECTOR)
580 #pragma _CRI preferstream
581 #endif
582           for (i=0; i<isize; i++) {
583 #if defined(PETSC_HAVE_CRAY_VECTOR)
584 #pragma _CRI prefervector
585 #endif
586             for (j=0; j<nz; j++) {
587               ipos   = ip[i] + j;
588               yp[i] += aa[ipos] * x[aj[ipos]];
589             }
590           }
591         }
592         /* Otherwise, there are enough rows in the chunk to make it
593          * worthwhile to vectorize across the rows, that is, to do the
594          * matvec by operating with "columns" of the chunk. */
595         else {
596           for (j=0; j<nz; j++) {
597             for (i=0; i<isize; i++) {
598               ipos   = ip[i] + j;
599               yp[i] += aa[ipos] * x[aj[ipos]];
600             }
601           }
602         }
603 
604 #if defined(PETSC_HAVE_CRAY_VECTOR)
605 #pragma _CRI ivdep
606 #endif
607         /* Put results from yp[] into non-permuted result vector y. */
608         for (i=0; i<isize; i++) {
609           y[iperm[istart+i]] = yp[i];
610         }
611       } /* End processing chunk of isize rows of a group. */
612 
613     } /* End handling matvec for chunk with nz > 1. */
614   } /* End loop over igroup. */
615 
616 #endif
617   PetscCall(PetscLogFlops(2.0*a->nz));
618   PetscCall(VecRestoreArrayRead(xx,&x));
619   PetscCall(VecRestoreArrayPair(yy,ww,&y,&w));
620   PetscFunctionReturn(0);
621 }
622 
623 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
624  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
625  * routine, but can also be used to convert an assembled SeqAIJ matrix
626  * into a SeqAIJPERM one. */
627 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
628 {
629   Mat            B = *newmat;
630   Mat_SeqAIJPERM *aijperm;
631   PetscBool      sametype;
632 
633   PetscFunctionBegin;
634   if (reuse == MAT_INITIAL_MATRIX) {
635     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,&B));
636   }
637   PetscCall(PetscObjectTypeCompare((PetscObject)A,type,&sametype));
638   if (sametype) PetscFunctionReturn(0);
639 
640   PetscCall(PetscNewLog(B,&aijperm));
641   B->spptr = (void*) aijperm;
642 
643   /* Set function pointers for methods that we inherit from AIJ but override. */
644   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
645   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
646   B->ops->destroy     = MatDestroy_SeqAIJPERM;
647   B->ops->mult        = MatMult_SeqAIJPERM;
648   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
649 
650   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
651   /* If A has already been assembled, compute the permutation. */
652   if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));
653 
654   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ));
655 
656   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM));
657   *newmat = B;
658   PetscFunctionReturn(0);
659 }
660 
661 /*@C
662    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
663    This type inherits from AIJ, but calculates some additional permutation
664    information that is used to allow better vectorization of some
665    operations.  At the cost of increased storage, the AIJ formatted
666    matrix can be copied to a format in which pieces of the matrix are
667    stored in ELLPACK format, allowing the vectorized matrix multiply
668    routine to use stride-1 memory accesses.  As with the AIJ type, it is
669    important to preallocate matrix storage in order to get good assembly
670    performance.
671 
672    Collective
673 
674    Input Parameters:
675 +  comm - MPI communicator, set to PETSC_COMM_SELF
676 .  m - number of rows
677 .  n - number of columns
678 .  nz - number of nonzeros per row (same for all rows)
679 -  nnz - array containing the number of nonzeros in the various rows
680          (possibly different for each row) or NULL
681 
682    Output Parameter:
683 .  A - the matrix
684 
685    Notes:
686    If nnz is given then nz is ignored
687 
688    Level: intermediate
689 
690 .seealso: `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
691 @*/
692 PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
693 {
694   PetscFunctionBegin;
695   PetscCall(MatCreate(comm,A));
696   PetscCall(MatSetSizes(*A,m,n,m,n));
697   PetscCall(MatSetType(*A,MATSEQAIJPERM));
698   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz));
699   PetscFunctionReturn(0);
700 }
701 
702 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
703 {
704   PetscFunctionBegin;
705   PetscCall(MatSetType(A,MATSEQAIJ));
706   PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A));
707   PetscFunctionReturn(0);
708 }
709