xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision 2d30e087755efd99e28fdfe792ffbeb2ee1ea928)
1 
2 /*
3   Defines basic operations for the MATSEQAIJPERM matrix class.
4   This class is derived from the MATSEQAIJ class and retains the
5   compressed row storage (aka Yale sparse matrix format) but augments
6   it with some permutation information that enables some operations
7   to be more vectorizable.  A physically rearranged copy of the matrix
8   may be stored if the user desires.
9 
10   Eventually a variety of permutations may be supported.
11 */
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16 #include <immintrin.h>
17 
18 #if !defined(_MM_SCALE_8)
19 #define _MM_SCALE_8 8
20 #endif
21 #if !defined(_MM_SCALE_4)
22 #define _MM_SCALE_4 4
23 #endif
24 #endif
25 
26 #define NDIM 512
27 /* NDIM specifies how many rows at a time we should work with when
28  * performing the vectorized mat-vec.  This depends on various factors
29  * such as vector register length, etc., and I really need to add a
30  * way for the user (or the library) to tune this.  I'm setting it to
31  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32  * routines. */
33 
/* Private data appended to a MATSEQAIJ to form a MATSEQAIJPERM.  It records
 * a permutation of the rows that sorts them by nonzero count, so matrix
 * kernels can vectorize over groups of rows with identical row lengths. */
typedef struct {
  PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */

  PetscInt  ngroup;  /* number of groups of rows sharing the same nonzero count */
  PetscInt *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end, i.e., xgroup[i] gives us the position in iperm[]
   * where the ith group begins. */

  PetscInt *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
  PetscInt *iperm;   /* The permutation vector. */

  /* Some of this stuff is for Ed's recursive triangular solve.
   * I'm not sure what I need yet.
   * NOTE(review): none of the fields below are touched anywhere in this
   * file; they appear to be reserved for a future/experimental solver. */
  PetscInt   blocksize;
  PetscInt   nstep;
  PetscInt  *jstart_list;
  PetscInt  *jend_list;
  PetscInt  *action_list;
  PetscInt  *ngroup_list;
  PetscInt **ipointer_list;
  PetscInt **xgroup_list;
  PetscInt **nzgroup_list;
  PetscInt **iperm_list;
} Mat_SeqAIJPERM;
59 
60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat) {
61   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
62   /* so we will ignore 'MatType type'. */
63   Mat             B       = *newmat;
64   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
65 
66   PetscFunctionBegin;
67   if (reuse == MAT_INITIAL_MATRIX) {
68     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
69     aijperm = (Mat_SeqAIJPERM *)B->spptr;
70   }
71 
72   /* Reset the original function pointers. */
73   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
74   B->ops->destroy     = MatDestroy_SeqAIJ;
75   B->ops->duplicate   = MatDuplicate_SeqAIJ;
76   B->ops->mult        = MatMult_SeqAIJ;
77   B->ops->multadd     = MatMultAdd_SeqAIJ;
78 
79   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));
80 
81   /* Free everything in the Mat_SeqAIJPERM data structure.*/
82   PetscCall(PetscFree(aijperm->xgroup));
83   PetscCall(PetscFree(aijperm->nzgroup));
84   PetscCall(PetscFree(aijperm->iperm));
85   PetscCall(PetscFree(B->spptr));
86 
87   /* Change the type of B to MATSEQAIJ. */
88   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));
89 
90   *newmat = B;
91   PetscFunctionReturn(0);
92 }
93 
94 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) {
95   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
96 
97   PetscFunctionBegin;
98   if (aijperm) {
99     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
100     PetscCall(PetscFree(aijperm->xgroup));
101     PetscCall(PetscFree(aijperm->nzgroup));
102     PetscCall(PetscFree(aijperm->iperm));
103     PetscCall(PetscFree(A->spptr));
104   }
105   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
106    * to destroy everything that remains. */
107   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
108   /* Note that I don't call MatSetType().  I believe this is because that
109    * is only to be called when *building* a matrix.  I could be wrong, but
110    * that is how things work for the SuperLU matrix class. */
111   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
112   PetscCall(MatDestroy_SeqAIJ(A));
113   PetscFunctionReturn(0);
114 }
115 
116 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) {
117   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
118   Mat_SeqAIJPERM *aijperm_dest;
119   PetscBool       perm;
120 
121   PetscFunctionBegin;
122   PetscCall(MatDuplicate_SeqAIJ(A, op, M));
123   PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
124   if (perm) {
125     aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
126     PetscCall(PetscFree(aijperm_dest->xgroup));
127     PetscCall(PetscFree(aijperm_dest->nzgroup));
128     PetscCall(PetscFree(aijperm_dest->iperm));
129   } else {
130     PetscCall(PetscNewLog(*M, &aijperm_dest));
131     (*M)->spptr = (void *)aijperm_dest;
132     PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
133     PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
134   }
135   PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
136   /* Allocate space for, and copy the grouping and permutation info.
137    * I note that when the groups are initially determined in
138    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
139    * necessary.  But at this point, we know how large they need to be, and
140    * allocate only the necessary amount of memory.  So the duplicated matrix
141    * may actually use slightly less storage than the original! */
142   PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
143   PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
144   PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
145   PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
146   PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
147   PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
148   PetscFunctionReturn(0);
149 }
150 
151 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) {
152   Mat_SeqAIJ     *a       = (Mat_SeqAIJ *)(A)->data;
153   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
154   PetscInt        m;     /* Number of rows in the matrix. */
155   PetscInt       *ia;    /* From the CSR representation; points to the beginning  of each row. */
156   PetscInt        maxnz; /* Maximum number of nonzeros in any row. */
157   PetscInt       *rows_in_bucket;
158   /* To construct the permutation, we sort each row into one of maxnz
159    * buckets based on how many nonzeros are in the row. */
160   PetscInt        nz;
161   PetscInt       *nz_in_row; /* the number of nonzero elements in row k. */
162   PetscInt       *ipnz;
163   /* When constructing the iperm permutation vector,
164    * ipnz[nz] is used to point to the next place in the permutation vector
165    * that a row with nz nonzero elements should be placed.*/
166   PetscInt        i, ngroup, istart, ipos;
167 
168   PetscFunctionBegin;
169   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
170   aijperm->nonzerostate = A->nonzerostate;
171   /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
172   PetscCall(PetscFree(aijperm->xgroup));
173   PetscCall(PetscFree(aijperm->nzgroup));
174   PetscCall(PetscFree(aijperm->iperm));
175 
176   m  = A->rmap->n;
177   ia = a->i;
178 
179   /* Allocate the arrays that will hold the permutation vector. */
180   PetscCall(PetscMalloc1(m, &aijperm->iperm));
181 
182   /* Allocate some temporary work arrays that will be used in
183    * calculating the permutation vector and groupings. */
184   PetscCall(PetscMalloc1(m, &nz_in_row));
185 
186   /* Now actually figure out the permutation and grouping. */
187 
188   /* First pass: Determine number of nonzeros in each row, maximum
189    * number of nonzeros in any row, and how many rows fall into each
190    * "bucket" of rows with same number of nonzeros. */
191   maxnz = 0;
192   for (i = 0; i < m; i++) {
193     nz_in_row[i] = ia[i + 1] - ia[i];
194     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
195   }
196   PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
197   PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));
198 
199   for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
200   for (i = 0; i < m; i++) {
201     nz = nz_in_row[i];
202     rows_in_bucket[nz]++;
203   }
204 
205   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
206    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
207    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
208    * then xgroup[] must consist of (maxnz + 2) elements, since the last
209    * element of xgroup will tell us where the (maxnz + 1)th group ends.
210    * We allocate space for the maximum number of groups;
211    * that is potentially a little wasteful, but not too much so.
212    * Perhaps I should fix it later. */
213   PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
214   PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));
215 
216   /* Second pass.  Look at what is in the buckets and create the groupings.
217    * Note that it is OK to have a group of rows with no non-zero values. */
218   ngroup = 0;
219   istart = 0;
220   for (i = 0; i <= maxnz; i++) {
221     if (rows_in_bucket[i] > 0) {
222       aijperm->nzgroup[ngroup] = i;
223       aijperm->xgroup[ngroup]  = istart;
224       ngroup++;
225       istart += rows_in_bucket[i];
226     }
227   }
228 
229   aijperm->xgroup[ngroup] = istart;
230   aijperm->ngroup         = ngroup;
231 
232   /* Now fill in the permutation vector iperm. */
233   ipnz[0] = 0;
234   for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];
235 
236   for (i = 0; i < m; i++) {
237     nz                   = nz_in_row[i];
238     ipos                 = ipnz[nz];
239     aijperm->iperm[ipos] = i;
240     ipnz[nz]++;
241   }
242 
243   /* Clean up temporary work arrays. */
244   PetscCall(PetscFree(rows_in_bucket));
245   PetscCall(PetscFree(ipnz));
246   PetscCall(PetscFree(nz_in_row));
247   PetscFunctionReturn(0);
248 }
249 
250 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) {
251   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252 
253   PetscFunctionBegin;
254   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
255 
256   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
257    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
258    * I'm not sure if this is the best way to do this, but it avoids
259    * a lot of code duplication.
260    * I also note that currently MATSEQAIJPERM doesn't know anything about
261    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
262    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
263    * this, this may break things.  (Don't know... haven't looked at it.) */
264   a->inode.use = PETSC_FALSE;
265   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
266 
267   /* Now calculate the permutation and grouping information. */
268   PetscCall(MatSeqAIJPERM_create_perm(A));
269   PetscFunctionReturn(0);
270 }
271 
/* MatMult_SeqAIJPERM: computes yy = A*xx.  Rows are processed group by
 * group (each group holds rows with the same nonzero count, as built by
 * MatSeqAIJPERM_create_perm()), in chunks of at most NDIM rows, so the
 * inner loops can vectorize either along one row's nonzeros or across the
 * rows of a chunk. */
PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy) {
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
#if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
  PetscInt i, j;
#endif
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
  __m512d  vec_x, vec_y, vec_vals;
  __m256i  vec_idx, vec_ipos, vec_j;
  __mmask8 mask;
#endif

  /* Variables that don't appear in MatMult_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt       *nzgroup;
  PetscInt        ngroup;
  PetscInt        igroup;
  PetscInt        jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt        iold, nz;
  PetscInt        istart, iend, isize;
  PetscInt        ipos;
  PetscScalar     yp[NDIM];
  PetscInt        ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArray(yy, &y));
  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
  fortranmultaijperm_(&m, x, ii, aj, aa, y);
#else

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;
    nz     = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = aa[ipos] * x[aj[ipos]];
      }
    } else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);

        if (iend > jend) iend = jend;

        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin. */
        for (i = 0; i < isize; i++) {
          iold  = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = (PetscScalar)0.0;
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif

#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* AVX-512 path: process this row's nonzeros 8 at a time,
             * gathering x through the 32-bit column indices and
             * accumulating partial products in vec_y. */
            vec_y = _mm512_setzero_pd();
            ipos  = ip[i];
            for (j = 0; j < (nz >> 3); j++) {
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              ipos += 8;
            }
            /* Remainder of 3..7 nonzeros: masked gather + FMA; remainders
             * of 1 or 2 are handled more cheaply with scalar code. */
            if ((nz & 0x07) > 2) {
              mask     = (__mmask8)(0xff >> (8 - (nz & 0x07)));
              vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
              vec_vals = _mm512_loadu_pd(&aa[ipos]);
              vec_x    = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
            } else if ((nz & 0x07) == 2) {
              yp[i] += aa[ipos] * x[aj[ipos]];
              yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
            } else if ((nz & 0x07) == 1) {
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
            /* Horizontal sum of the 8 partial products into the scalar accumulator. */
            yp[i] += _mm512_reduce_add_pd(vec_y);
#else
            /* Portable path: plain dot product of row i's nonzeros with x. */
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        } else {
          /* Otherwise, there are enough rows in the chunk to make it
           * worthwhile to vectorize across the rows, that is, to do the
           * matvec by operating with "columns" of the chunk. */
          for (j = 0; j < nz; j++) {
#if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
            /* AVX-512 path: handle 8 rows of the chunk at a time, gathering
             * each row's jth nonzero (value and column index) through ip[]. */
            vec_j = _mm256_set1_epi32(j);
            for (i = 0; i < ((isize >> 3) << 3); i += 8) {
              vec_y    = _mm512_loadu_pd(&yp[i]);
              vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
              vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
              vec_idx  = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
              vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
              vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
              vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
              _mm512_storeu_pd(&yp[i], vec_y);
            }
            /* Scalar tail for the final (isize % 8) rows. */
            for (i = isize - (isize & 0x07); i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#else
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
#endif
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */
    }   /* End handling matvec for chunk with nz > 1. */
  }     /* End loop over igroup. */
#endif
  /* 2*nz - nrows flops (each row's first product needs no add), floored at zero. */
  PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArray(yy, &y));
  PetscFunctionReturn(0);
}
452 
/* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
 * Note that the names I used to designate the vectors differs from that
 * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one.
 * The structure is identical to MatMult_SeqAIJPERM() except that yp[] is
 * seeded with w instead of zero (and there is no AVX-512 path here). */
/*
    I hate having virtually identical code for the mult and the multadd!!!
*/
PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy) {
  Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
  const PetscScalar *x;
  PetscScalar       *y, *w;
  const MatScalar   *aa;
  const PetscInt    *aj, *ai;
#if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  PetscInt i, j;
#endif

  /* Variables that don't appear in MatMultAdd_SeqAIJ. */
  Mat_SeqAIJPERM *aijperm;
  PetscInt       *iperm; /* Points to the permutation vector. */
  PetscInt       *xgroup;
  /* Denotes where groups of rows with same number of nonzeros
   * begin and end in iperm. */
  PetscInt       *nzgroup;
  PetscInt        ngroup;
  PetscInt        igroup;
  PetscInt        jstart, jend;
  /* jstart is used in loops to denote the position in iperm where a
   * group starts; jend denotes the position where it ends.
   * (jend + 1 is where the next group starts.) */
  PetscInt        iold, nz;
  PetscInt        istart, iend, isize;
  PetscInt        ipos;
  PetscScalar     yp[NDIM];
  PetscInt        ip[NDIM];
  /* yp[] and ip[] are treated as vector "registers" for performing
   * the mat-vec. */

#if defined(PETSC_HAVE_PRAGMA_DISJOINT)
#pragma disjoint(*x, *y, *aa)
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(xx, &x));
  PetscCall(VecGetArrayPair(yy, ww, &y, &w));

  aj = a->j; /* aj[k] gives column index for element aa[k]. */
  aa = a->a; /* Nonzero elements stored row-by-row. */
  ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */

  /* Get the info we need about the permutations and groupings. */
  aijperm = (Mat_SeqAIJPERM *)A->spptr;
  iperm   = aijperm->iperm;
  ngroup  = aijperm->ngroup;
  xgroup  = aijperm->xgroup;
  nzgroup = aijperm->nzgroup;

#if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
  fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w);
#else

  for (igroup = 0; igroup < ngroup; igroup++) {
    jstart = xgroup[igroup];
    jend   = xgroup[igroup + 1] - 1;

    nz = nzgroup[igroup];

    /* Handle the special cases where the number of nonzeros per row
     * in the group is either 0 or 1. */
    if (nz == 0) {
      /* Empty rows: result is simply w. */
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        y[iold] = w[iold];
      }
    } else if (nz == 1) {
      for (i = jstart; i <= jend; i++) {
        iold    = iperm[i];
        ipos    = ai[iold];
        y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
      }
    }
    /* For the general case: */
    else {
      /* We work our way through the current group in chunks of NDIM rows
       * at a time. */

      for (istart = jstart; istart <= jend; istart += NDIM) {
        /* Figure out where the chunk of 'isize' rows ends in iperm.
         * 'isize may of course be less than NDIM for the last chunk. */
        iend = istart + (NDIM - 1);
        if (iend > jend) iend = jend;
        isize = iend - istart + 1;

        /* Initialize the yp[] array that will be used to hold part of
         * the permuted results vector, and figure out where in aa each
         * row of the chunk will begin.  Unlike MatMult_SeqAIJPERM(),
         * yp[] is seeded with w so the add comes for free. */
        for (i = 0; i < isize; i++) {
          iold  = iperm[istart + i];
          /* iold is a row number from the matrix A *before* reordering. */
          ip[i] = ai[iold];
          /* ip[i] tells us where the ith row of the chunk begins in aa. */
          yp[i] = w[iold];
        }

        /* If the number of zeros per row exceeds the number of rows in
         * the chunk, we should vectorize along nz, that is, perform the
         * mat-vec one row at a time as in the usual CSR case. */
        if (nz > isize) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI preferstream
#endif
          for (i = 0; i < isize; i++) {
#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI prefervector
#endif
            for (j = 0; j < nz; j++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }
        /* Otherwise, there are enough rows in the chunk to make it
         * worthwhile to vectorize across the rows, that is, to do the
         * matvec by operating with "columns" of the chunk. */
        else {
          for (j = 0; j < nz; j++) {
            for (i = 0; i < isize; i++) {
              ipos = ip[i] + j;
              yp[i] += aa[ipos] * x[aj[ipos]];
            }
          }
        }

#if defined(PETSC_HAVE_CRAY_VECTOR)
#pragma _CRI ivdep
#endif
        /* Put results from yp[] into non-permuted result vector y. */
        for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
      } /* End processing chunk of isize rows of a group. */

    } /* End handling matvec for chunk with nz > 1. */
  }   /* End loop over igroup. */

#endif
  PetscCall(PetscLogFlops(2.0 * a->nz));
  PetscCall(VecRestoreArrayRead(xx, &x));
  PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
  PetscFunctionReturn(0);
}
602 
603 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
604  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
605  * routine, but can also be used to convert an assembled SeqAIJ matrix
606  * into a SeqAIJPERM one. */
607 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat) {
608   Mat             B = *newmat;
609   Mat_SeqAIJPERM *aijperm;
610   PetscBool       sametype;
611 
612   PetscFunctionBegin;
613   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
614   PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
615   if (sametype) PetscFunctionReturn(0);
616 
617   PetscCall(PetscNewLog(B, &aijperm));
618   B->spptr = (void *)aijperm;
619 
620   /* Set function pointers for methods that we inherit from AIJ but override. */
621   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
622   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
623   B->ops->destroy     = MatDestroy_SeqAIJPERM;
624   B->ops->mult        = MatMult_SeqAIJPERM;
625   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
626 
627   aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
628   /* If A has already been assembled, compute the permutation. */
629   if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));
630 
631   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
632 
633   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM));
634   *newmat = B;
635   PetscFunctionReturn(0);
636 }
637 
/*@C
   MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.
   This type inherits from `MATSEQAIJ`, but calculates some additional permutation
   information that is used to allow better vectorization of some
   operations.  At the cost of increased storage, the `MATSEQAIJ` formatted
   matrix can be copied to a format in which pieces of the matrix are
   stored in ELLPACK format, allowing the vectorized matrix multiply
   routine to use stride-1 memory accesses.  As with the `MATSEQAIJ` type, it is
   important to preallocate matrix storage in order to get good assembly
   performance.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to `PETSC_COMM_SELF`
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   Note:
   If nnz is given then nz is ignored

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
@*/
PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
  PetscFunctionBegin;
  /* Create the matrix, set its type (which triggers the conversion from
   * SeqAIJ in MatCreate_SeqAIJPERM()), then preallocate storage. */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJPERM));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz));
  PetscFunctionReturn(0);
}
677 
/* Registered constructor for MATSEQAIJPERM: builds a SeqAIJ matrix and
 * converts it in place to SeqAIJPERM. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) {
  PetscFunctionBegin;
  PetscCall(MatSetType(A, MATSEQAIJ));
  PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(0);
}
684