xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision d2522c19e8fa9bca20aaca277941d9a63e71db6a)
1 
2 /*
3   Defines basic operations for the MATSEQAIJPERM matrix class.
4   This class is derived from the MATSEQAIJ class and retains the
5   compressed row storage (aka Yale sparse matrix format) but augments
6   it with some permutation information that enables some operations
7   to be more vectorizable.  A physically rearranged copy of the matrix
8   may be stored if the user desires.
9 
10   Eventually a variety of permutations may be supported.
11 */
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16 #include <immintrin.h>
17 
18 #if !defined(_MM_SCALE_8)
19 #define _MM_SCALE_8 8
20 #endif
21 #if !defined(_MM_SCALE_4)
22 #define _MM_SCALE_4 4
23 #endif
24 #endif
25 
26 #define NDIM 512
27 /* NDIM specifies how many rows at a time we should work with when
28  * performing the vectorized mat-vec.  This depends on various factors
29  * such as vector register length, etc., and I really need to add a
30  * way for the user (or the library) to tune this.  I'm setting it to
31  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32  * routines. */
33 
typedef struct {
  PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

  PetscInt  ngroup; /* number of groups of rows that share a common nonzero count */
  PetscInt *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end, i.e., xgroup[i] gives us the position in iperm[]
   * where the ith group begins; xgroup[ngroup] marks one past the
   * end of the last group. */

  PetscInt *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
  PetscInt *iperm;   /* The permutation vector: iperm[k] is the original (pre-permutation) row number of the kth permuted row. */

  /* Some of this stuff is for Ed's recursive triangular solve.
   * I'm not sure what I need yet.
   * NOTE(review): none of the fields below are referenced anywhere in this
   * file — confirm whether they are used elsewhere or can be removed. */
  PetscInt   blocksize;
  PetscInt   nstep;
  PetscInt  *jstart_list;
  PetscInt  *jend_list;
  PetscInt  *action_list;
  PetscInt  *ngroup_list;
  PetscInt **ipointer_list;
  PetscInt **xgroup_list;
  PetscInt **nzgroup_list;
  PetscInt **iperm_list;
} Mat_SeqAIJPERM;
59 
60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat) {
61   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
62   /* so we will ignore 'MatType type'. */
63   Mat             B       = *newmat;
64   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
65 
66   PetscFunctionBegin;
67   if (reuse == MAT_INITIAL_MATRIX) {
68     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
69     aijperm = (Mat_SeqAIJPERM *)B->spptr;
70   }
71 
72   /* Reset the original function pointers. */
73   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
74   B->ops->destroy     = MatDestroy_SeqAIJ;
75   B->ops->duplicate   = MatDuplicate_SeqAIJ;
76   B->ops->mult        = MatMult_SeqAIJ;
77   B->ops->multadd     = MatMultAdd_SeqAIJ;
78 
79   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));
80 
81   /* Free everything in the Mat_SeqAIJPERM data structure.*/
82   PetscCall(PetscFree(aijperm->xgroup));
83   PetscCall(PetscFree(aijperm->nzgroup));
84   PetscCall(PetscFree(aijperm->iperm));
85   PetscCall(PetscFree(B->spptr));
86 
87   /* Change the type of B to MATSEQAIJ. */
88   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));
89 
90   *newmat = B;
91   PetscFunctionReturn(0);
92 }
93 
94 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) {
95   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
96 
97   PetscFunctionBegin;
98   if (aijperm) {
99     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
100     PetscCall(PetscFree(aijperm->xgroup));
101     PetscCall(PetscFree(aijperm->nzgroup));
102     PetscCall(PetscFree(aijperm->iperm));
103     PetscCall(PetscFree(A->spptr));
104   }
105   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
106    * to destroy everything that remains. */
107   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
108   /* Note that I don't call MatSetType().  I believe this is because that
109    * is only to be called when *building* a matrix.  I could be wrong, but
110    * that is how things work for the SuperLU matrix class. */
111   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
112   PetscCall(MatDestroy_SeqAIJ(A));
113   PetscFunctionReturn(0);
114 }
115 
116 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) {
117   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
118   Mat_SeqAIJPERM *aijperm_dest;
119   PetscBool       perm;
120 
121   PetscFunctionBegin;
122   PetscCall(MatDuplicate_SeqAIJ(A, op, M));
123   PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
124   if (perm) {
125     aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
126     PetscCall(PetscFree(aijperm_dest->xgroup));
127     PetscCall(PetscFree(aijperm_dest->nzgroup));
128     PetscCall(PetscFree(aijperm_dest->iperm));
129   } else {
130     PetscCall(PetscNewLog(*M, &aijperm_dest));
131     (*M)->spptr = (void *)aijperm_dest;
132     PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
133     PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
134   }
135   PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
136   /* Allocate space for, and copy the grouping and permutation info.
137    * I note that when the groups are initially determined in
138    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
139    * necessary.  But at this point, we know how large they need to be, and
140    * allocate only the necessary amount of memory.  So the duplicated matrix
141    * may actually use slightly less storage than the original! */
142   PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
143   PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
144   PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
145   PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
146   PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
147   PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
148   PetscFunctionReturn(0);
149 }
150 
/* Builds the row permutation and grouping information for A: rows are
 * counting-sorted by their nonzero count, so that all rows with the same
 * number of nonzeros are contiguous in iperm[].  A no-op when the nonzero
 * structure is unchanged since the last call. */
PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) {
  Mat_SeqAIJ     *a       = (Mat_SeqAIJ *)(A)->data;
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt        m;     /* Number of rows in the matrix. */
  PetscInt       *ia;    /* From the CSR representation; points to the beginning  of each row. */
  PetscInt        maxnz; /* Maximum number of nonzeros in any row. */
  PetscInt       *rows_in_bucket;
  /* To construct the permutation, we sort each row into one of maxnz
   * buckets based on how many nonzeros are in the row. */
  PetscInt        nz;
  PetscInt       *nz_in_row; /* the number of nonzero elements in row k. */
  PetscInt       *ipnz;
  /* When constructing the iperm permutation vector,
   * ipnz[nz] is used to point to the next place in the permutation vector
   * that a row with nz nonzero elements should be placed.*/
  PetscInt        i, ngroup, istart, ipos;

  PetscFunctionBegin;
  if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
  aijperm->nonzerostate = A->nonzerostate;
  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
  PetscCall(PetscFree(aijperm->xgroup));
  PetscCall(PetscFree(aijperm->nzgroup));
  PetscCall(PetscFree(aijperm->iperm));

  m  = A->rmap->n;
  ia = a->i;

  /* Allocate the arrays that will hold the permutation vector. */
  PetscCall(PetscMalloc1(m, &aijperm->iperm));

  /* Allocate some temporary work arrays that will be used in
   * calculating the permutation vector and groupings. */
  PetscCall(PetscMalloc1(m, &nz_in_row));

  /* Now actually figure out the permutation and grouping. */

  /* First pass: Determine number of nonzeros in each row, maximum
   * number of nonzeros in any row, and how many rows fall into each
   * "bucket" of rows with same number of nonzeros. */
  maxnz = 0;
  for (i = 0; i < m; i++) {
    nz_in_row[i] = ia[i + 1] - ia[i];
    if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
  }
  /* NOTE(review): only maxnz + 1 entries of these scratch arrays are used
   * below; the PetscMax(maxnz, m) sizing looks over-generous — confirm
   * whether the extra room is intentional. */
  PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
  PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));

  for (i = 0; i <= maxnz; i++) { rows_in_bucket[i] = 0; }
  for (i = 0; i < m; i++) {
    nz = nz_in_row[i];
    rows_in_bucket[nz]++;
  }

  /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
   * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
   * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
   * then xgroup[] must consist of (maxnz + 2) elements, since the last
   * element of xgroup will tell us where the (maxnz + 1)th group ends.
   * We allocate space for the maximum number of groups;
   * that is potentially a little wasteful, but not too much so.
   * Perhaps I should fix it later. */
  PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
  PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));

  /* Second pass.  Look at what is in the buckets and create the groupings.
   * Note that it is OK to have a group of rows with no non-zero values. */
  ngroup = 0;
  istart = 0;
  for (i = 0; i <= maxnz; i++) {
    if (rows_in_bucket[i] > 0) {
      aijperm->nzgroup[ngroup] = i;
      aijperm->xgroup[ngroup]  = istart;
      ngroup++;
      istart += rows_in_bucket[i];
    }
  }

  aijperm->xgroup[ngroup] = istart;
  aijperm->ngroup         = ngroup;

  /* Now fill in the permutation vector iperm: prefix-sum the bucket counts
   * to get each bucket's starting offset, then place each row (in original
   * order, so the sort is stable) at the next free slot of its bucket. */
  ipnz[0] = 0;
  for (i = 0; i < maxnz; i++) { ipnz[i + 1] = ipnz[i] + rows_in_bucket[i]; }

  for (i = 0; i < m; i++) {
    nz                   = nz_in_row[i];
    ipos                 = ipnz[nz];
    aijperm->iperm[ipos] = i;
    ipnz[nz]++;
  }

  /* Clean up temporary work arrays. */
  PetscCall(PetscFree(rows_in_bucket));
  PetscCall(PetscFree(ipnz));
  PetscCall(PetscFree(nz_in_row));
  PetscFunctionReturn(0);
}
249 
250 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) {
251   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252 
253   PetscFunctionBegin;
254   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
255 
256   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
257    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
258    * I'm not sure if this is the best way to do this, but it avoids
259    * a lot of code duplication.
260    * I also note that currently MATSEQAIJPERM doesn't know anything about
261    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
262    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
263    * this, this may break things.  (Don't know... haven't looked at it.) */
264   a->inode.use = PETSC_FALSE;
265   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
266 
267   /* Now calculate the permutation and grouping information. */
268   PetscCall(MatSeqAIJPERM_create_perm(A));
269   PetscFunctionReturn(0);
270 }
271 
/* Computes yy = A*xx.  Rows are processed group by group (all rows in a group
 * have the same nonzero count nz, see MatSeqAIJPERM_create_perm()), in chunks
 * of up to NDIM rows; yp[] and ip[] act as vector "registers" for the chunk. */
PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy) {
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
#if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
  PetscInt i, j;
#endif
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx, vec_ipos, vec_j;
  __mmask8 mask;
#endif

  /* Variables that don't appear in MatMult_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt       *nzgroup;
  PetscInt        ngroup;
  PetscInt        igroup;
  PetscInt        jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt        iold, nz;
  PetscInt        istart, iend, isize;
  PetscInt        ipos;
  PetscScalar     yp[NDIM];
  PetscInt        ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
  fortranmultaijperm_(&m, x, ii, aj, aa, y);
#else

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;
    nz     = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i = jstart; i <= jend; i++) { y[iperm[i]] = 0.0; }
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = aa[ipos] * x[aj[ipos]];
      }
    } else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize' may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);

        if (iend > jend) iend = jend;

        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i = 0; i < isize; i++) {
          iold  = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = (PetscScalar)0.0;
        }

        /* If the number of nonzeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif

#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* AVX-512 row kernel: process 8 nonzeros per iteration, gathering
             * x through 32-bit column indices, accumulating with FMA. */
            vec_y = _mm512_setzero_pd();
            ipos  = ip[i];
            for (j = 0; j < (nz >> 3); j++) {
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              ipos += 8;
            }
            /* Remainder of 3..7 nonzeros: one masked gather + masked FMA.
             * Remainders of 1 or 2 are done with cheaper scalar code.
             * NOTE(review): vec_x is passed as the masked-gather pass-through;
             * when nz < 8 the loop above never ran, so its unmasked lanes are
             * indeterminate — they appear unused under 'mask', but confirm. */
            if ((nz & 0x07) > 2) {
              mask     = (__mmask8)(0xff >> (8 - (nz & 0x07)));
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
            } else if ((nz & 0x07) == 2) {
              yp[i] += aa[ipos] * x[aj[ipos]];
              yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
            } else if ((nz & 0x07) == 1) {
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
            /* Horizontal sum of the 8 partial products. */
            yp[i] += _mm512_reduce_add_pd(vec_y);
#else
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        } else {
          /* Otherwise, there are enough rows in the chunk to make it
           * worthwhile to vectorize across the rows, that is, to do the
           * matvec by operating with "columns" of the chunk. */
          for (j = 0; j < nz; j++) {
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* AVX-512 column kernel: for the jth nonzero of each row, process
             * 8 rows at a time; the tail rows are handled by a scalar loop. */
            vec_j = _mm256_set1_epi32(j);
            for (i = 0; i < ((isize >> 3) << 3); i += 8) {
              vec_y    = _mm512_loadu_pd(&yp[i]);
              vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
              vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
              vec_idx  = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
              vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              _mm512_storeu_pd(&yp[i], vec_y);
            }
            for (i = isize - (isize & 0x07); i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#else
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) { y[iperm[istart + i]] = yp[i]; }
      } /* End processing chunk of isize rows of a group. */
    }   /* End handling matvec for chunk with nz > 1. */
  }     /* End loop over igroup. */
#endif
  PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}
452 
/* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
 * Note that the names used to designate the vectors differ from those
 * used in MatMultAdd_SeqAIJ(); this keeps the notation consistent with
 * MatMult_SeqAIJPERM(), which this routine closely mirrors (the only
 * differences: yp[] is seeded from w instead of zero, the nz==0/1 cases
 * add w, and there is no AVX-512 path here). */
/*
    I hate having virtually identical code for the mult and the multadd!!!
*/
PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy) {
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y, *w;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
#if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  PetscInt i, j;
#endif

  /* Variables that don't appear in MatMultAdd_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt       *nzgroup;
  PetscInt        ngroup;
  PetscInt        igroup;
  PetscInt        jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt        iold, nz;
  PetscInt        istart, iend, isize;
  PetscInt        ipos;
  PetscScalar     yp[NDIM];
  PetscInt        ip[NDIM];
  /* yp[] and ip[] are treated as vector "registers" for performing
   * the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, ww, &y, &w));

  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  aijperm = (Mat_SeqAIJPERM *)A->spptr;
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w);
#else

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;

    nz = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        y[iold] = w[iold];
      }
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
      }
    }
    /* For the general case: */
    else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize' may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);
        if (iend > jend) iend = jend;
        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector (seeded with w so the add is free),
         * and figure out where in aa each row of the chunk will begin. */
        for (i = 0; i < isize; i++) {
          iold  = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = w[iold];
        }

        /* If the number of nonzeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }
        /* Otherwise, there are enough rows in the chunk to make it
         * worthwhile to vectorize across the rows, that is, to do the
         * matvec by operating with "columns" of the chunk. */
        else {
          for (j = 0; j < nz; j++) {
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) { y[iperm[istart + i]] = yp[i]; }
      } /* End processing chunk of isize rows of a group. */

    } /* End handling matvec for chunk with nz > 1. */
  }   /* End loop over igroup. */

#endif
  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
  PetscFunctionReturn(0);
}
602 
603 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
604  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
605  * routine, but can also be used to convert an assembled SeqAIJ matrix
606  * into a SeqAIJPERM one. */
607 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat) {
608   Mat             B = *newmat;
609   Mat_SeqAIJPERM *aijperm;
610   PetscBool       sametype;
611 
612   PetscFunctionBegin;
613   if (reuse == MAT_INITIAL_MATRIX) { PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B)); }
614   PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
615   if (sametype) PetscFunctionReturn(0);
616 
617   PetscCall(PetscNewLog(B, &aijperm));
618   B->spptr = (void *)aijperm;
619 
620   /* Set function pointers for methods that we inherit from AIJ but override. */
621   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
622   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
623   B->ops->destroy     = MatDestroy_SeqAIJPERM;
624   B->ops->mult        = MatMult_SeqAIJPERM;
625   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
626 
627   aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
628   /* If A has already been assembled, compute the permutation. */
629   if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));
630 
631   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
632 
633   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM));
634   *newmat = B;
635   PetscFunctionReturn(0);
636 }
637 
638 /*@C
639    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
640    This type inherits from AIJ, but calculates some additional permutation
641    information that is used to allow better vectorization of some
642    operations.  At the cost of increased storage, the AIJ formatted
643    matrix can be copied to a format in which pieces of the matrix are
644    stored in ELLPACK format, allowing the vectorized matrix multiply
645    routine to use stride-1 memory accesses.  As with the AIJ type, it is
646    important to preallocate matrix storage in order to get good assembly
647    performance.
648 
649    Collective
650 
651    Input Parameters:
652 +  comm - MPI communicator, set to PETSC_COMM_SELF
653 .  m - number of rows
654 .  n - number of columns
655 .  nz - number of nonzeros per row (same for all rows)
656 -  nnz - array containing the number of nonzeros in the various rows
657          (possibly different for each row) or NULL
658 
659    Output Parameter:
660 .  A - the matrix
661 
662    Notes:
663    If nnz is given then nz is ignored
664 
665    Level: intermediate
666 
667 .seealso: `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
668 @*/
/* See the manual-page comment above for the full contract.  Creates,
 * types, and preallocates a MATSEQAIJPERM matrix in one call. */
PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  /* Sequential matrix: local and global dimensions coincide. */
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJPERM));
  /* Preallocate through the SeqAIJ routine; per the man page, nnz (if
   * non-NULL) takes precedence over nz. */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz));
  PetscFunctionReturn(0);
}
677 
/* Registered constructor for MATSEQAIJPERM: builds the matrix as a plain
 * MATSEQAIJ, then converts it in place to the AIJPERM type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) {
  PetscFunctionBegin;
  PetscCall(MatSetType(A, MATSEQAIJ));
  PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(0);
}
684