xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision 00a402e09c6d62f19dbf72eef0e53394ab690d5c)
1 
2 /*
3   Defines basic operations for the MATSEQAIJPERM matrix class.
4   This class is derived from the MATSEQAIJ class and retains the
5   compressed row storage (aka Yale sparse matrix format) but augments
6   it with some permutation information that enables some operations
7   to be more vectorizable.  A physically rearranged copy of the matrix
8   may be stored if the user desires.
9 
10   Eventually a variety of permutations may be supported.
11 */
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16 #include <immintrin.h>
17 
18 #if !defined(_MM_SCALE_8)
19 #define _MM_SCALE_8    8
20 #endif
21 #if !defined(_MM_SCALE_4)
22 #define _MM_SCALE_4    4
23 #endif
24 #endif
25 
26 #define NDIM 512
27 /* NDIM specifies how many rows at a time we should work with when
28  * performing the vectorized mat-vec.  This depends on various factors
29  * such as vector register length, etc., and I really need to add a
30  * way for the user (or the library) to tune this.  I'm setting it to
31  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32  * routines. */
33 
34 typedef struct {
35   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36 
37   PetscInt         ngroup;
38   PetscInt         *xgroup;
39   /* Denotes where groups of rows with same number of nonzeros
40    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41    * where the ith group begins. */
42 
43   PetscInt         *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
44   PetscInt         *iperm;  /* The permutation vector. */
45 
46   /* Some of this stuff is for Ed's recursive triangular solve.
47    * I'm not sure what I need yet. */
48   PetscInt         blocksize;
49   PetscInt         nstep;
50   PetscInt         *jstart_list;
51   PetscInt         *jend_list;
52   PetscInt         *action_list;
53   PetscInt         *ngroup_list;
54   PetscInt         **ipointer_list;
55   PetscInt         **xgroup_list;
56   PetscInt         **nzgroup_list;
57   PetscInt         **iperm_list;
58 } Mat_SeqAIJPERM;
59 
60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61 {
62   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63   /* so we will ignore 'MatType type'. */
64   PetscErrorCode ierr;
65   Mat            B       = *newmat;
66   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
67 
68   PetscFunctionBegin;
69   if (reuse == MAT_INITIAL_MATRIX) {
70     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
71     aijperm=(Mat_SeqAIJPERM*)B->spptr;
72   }
73 
74   /* Reset the original function pointers. */
75   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
76   B->ops->destroy     = MatDestroy_SeqAIJ;
77   B->ops->duplicate   = MatDuplicate_SeqAIJ;
78   B->ops->mult        = MatMult_SeqAIJ;
79   B->ops->multadd     = MatMultAdd_SeqAIJ;
80 
81   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr);
82   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
83   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
84   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
85 
86   /* Free everything in the Mat_SeqAIJPERM data structure.*/
87   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
88   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
89   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
90   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
91 
92   /* Change the type of B to MATSEQAIJ. */
93   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
94 
95   *newmat = B;
96   PetscFunctionReturn(0);
97 }
98 
99 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
100 {
101   PetscErrorCode ierr;
102   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
103 
104   PetscFunctionBegin;
105   if (aijperm) {
106     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
107     ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
108     ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
109     ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
110     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
111   }
112   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
113    * to destroy everything that remains. */
114   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
115   /* Note that I don't call MatSetType().  I believe this is because that
116    * is only to be called when *building* a matrix.  I could be wrong, but
117    * that is how things work for the SuperLU matrix class. */
118   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
119   PetscFunctionReturn(0);
120 }
121 
122 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
123 {
124   PetscErrorCode ierr;
125   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
126   Mat_SeqAIJPERM *aijperm_dest;
127   PetscBool      perm;
128 
129   PetscFunctionBegin;
130   ierr        = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
131   ierr        = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr);
132   if (perm) {
133     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
134     ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr);
135     ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr);
136     ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr);
137   } else {
138     ierr        = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr);
139     (*M)->spptr = (void*) aijperm_dest;
140     ierr        = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr);
141     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
142     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
143     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
144     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
145   }
146   ierr        = PetscMemcpy(aijperm_dest,aijperm,sizeof(Mat_SeqAIJPERM));CHKERRQ(ierr);
147   /* Allocate space for, and copy the grouping and permutation info.
148    * I note that when the groups are initially determined in
149    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
150    * necessary.  But at this point, we know how large they need to be, and
151    * allocate only the necessary amount of memory.  So the duplicated matrix
152    * may actually use slightly less storage than the original! */
153   ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr);
154   ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr);
155   ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr);
156   ierr = PetscMemcpy(aijperm_dest->iperm,aijperm->iperm,sizeof(PetscInt)*A->rmap->n);CHKERRQ(ierr);
157   ierr = PetscMemcpy(aijperm_dest->xgroup,aijperm->xgroup,sizeof(PetscInt)*(aijperm->ngroup+1));CHKERRQ(ierr);
158   ierr = PetscMemcpy(aijperm_dest->nzgroup,aijperm->nzgroup,sizeof(PetscInt)*aijperm->ngroup);CHKERRQ(ierr);
159   PetscFunctionReturn(0);
160 }
161 
162 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
163 {
164   PetscErrorCode ierr;
165   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
166   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
167   PetscInt       m;       /* Number of rows in the matrix. */
168   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
169   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
170   PetscInt       *rows_in_bucket;
171   /* To construct the permutation, we sort each row into one of maxnz
172    * buckets based on how many nonzeros are in the row. */
173   PetscInt       nz;
174   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
175   PetscInt       *ipnz;
176   /* When constructing the iperm permutation vector,
177    * ipnz[nz] is used to point to the next place in the permutation vector
178    * that a row with nz nonzero elements should be placed.*/
179   PetscInt       i, ngroup, istart, ipos;
180 
181   PetscFunctionBegin;
182   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
183   aijperm->nonzerostate = A->nonzerostate;
184  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
185   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
186   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
187   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
188 
189   m  = A->rmap->n;
190   ia = a->i;
191 
192   /* Allocate the arrays that will hold the permutation vector. */
193   ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr);
194 
195   /* Allocate some temporary work arrays that will be used in
196    * calculating the permuation vector and groupings. */
197   ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr);
198 
199   /* Now actually figure out the permutation and grouping. */
200 
201   /* First pass: Determine number of nonzeros in each row, maximum
202    * number of nonzeros in any row, and how many rows fall into each
203    * "bucket" of rows with same number of nonzeros. */
204   maxnz = 0;
205   for (i=0; i<m; i++) {
206     nz_in_row[i] = ia[i+1]-ia[i];
207     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
208   }
209   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr);
210   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr);
211 
212   for (i=0; i<=maxnz; i++) {
213     rows_in_bucket[i] = 0;
214   }
215   for (i=0; i<m; i++) {
216     nz = nz_in_row[i];
217     rows_in_bucket[nz]++;
218   }
219 
220   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
221    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
222    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
223    * then xgroup[] must consist of (maxnz + 2) elements, since the last
224    * element of xgroup will tell us where the (maxnz + 1)th group ends.
225    * We allocate space for the maximum number of groups;
226    * that is potentially a little wasteful, but not too much so.
227    * Perhaps I should fix it later. */
228   ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr);
229   ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr);
230 
231   /* Second pass.  Look at what is in the buckets and create the groupings.
232    * Note that it is OK to have a group of rows with no non-zero values. */
233   ngroup = 0;
234   istart = 0;
235   for (i=0; i<=maxnz; i++) {
236     if (rows_in_bucket[i] > 0) {
237       aijperm->nzgroup[ngroup] = i;
238       aijperm->xgroup[ngroup]  = istart;
239       ngroup++;
240       istart += rows_in_bucket[i];
241     }
242   }
243 
244   aijperm->xgroup[ngroup] = istart;
245   aijperm->ngroup         = ngroup;
246 
247   /* Now fill in the permutation vector iperm. */
248   ipnz[0] = 0;
249   for (i=0; i<maxnz; i++) {
250     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
251   }
252 
253   for (i=0; i<m; i++) {
254     nz                   = nz_in_row[i];
255     ipos                 = ipnz[nz];
256     aijperm->iperm[ipos] = i;
257     ipnz[nz]++;
258   }
259 
260   /* Clean up temporary work arrays. */
261   ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr);
262   ierr = PetscFree(ipnz);CHKERRQ(ierr);
263   ierr = PetscFree(nz_in_row);CHKERRQ(ierr);
264   PetscFunctionReturn(0);
265 }
266 
267 
268 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
269 {
270   PetscErrorCode ierr;
271   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
272 
273   PetscFunctionBegin;
274   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
275 
276   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
277    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
278    * I'm not sure if this is the best way to do this, but it avoids
279    * a lot of code duplication.
280    * I also note that currently MATSEQAIJPERM doesn't know anything about
281    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
282    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
283    * this, this may break things.  (Don't know... haven't looked at it.) */
284   a->inode.use = PETSC_FALSE;
285   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
286 
287   /* Now calculate the permutation and grouping information. */
288   ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr);
289   PetscFunctionReturn(0);
290 }
291 
292 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
293 {
294   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
295   const PetscScalar *x;
296   PetscScalar       *y;
297   const MatScalar   *aa;
298   PetscErrorCode    ierr;
299   const PetscInt    *aj,*ai;
300 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
301   PetscInt          i,j;
302 #endif
303 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
304   __m512d           vec_x,vec_y,vec_vals;
305   __m256i           vec_idx,vec_ipos,vec_j;
306   __mmask8           mask;
307 #endif
308 
309   /* Variables that don't appear in MatMult_SeqAIJ. */
310   Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
311   PetscInt          *iperm;  /* Points to the permutation vector. */
312   PetscInt          *xgroup;
313   /* Denotes where groups of rows with same number of nonzeros
314    * begin and end in iperm. */
315   PetscInt          *nzgroup;
316   PetscInt          ngroup;
317   PetscInt          igroup;
318   PetscInt          jstart,jend;
319   /* jstart is used in loops to denote the position in iperm where a
320    * group starts; jend denotes the position where it ends.
321    * (jend + 1 is where the next group starts.) */
322   PetscInt          iold,nz;
323   PetscInt          istart,iend,isize;
324   PetscInt          ipos;
325   PetscScalar       yp[NDIM];
326   PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
327 
328 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
329 #pragma disjoint(*x,*y,*aa)
330 #endif
331 
332   PetscFunctionBegin;
333   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
334   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
335   aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
336   aa   = a->a; /* Nonzero elements stored row-by-row. */
337   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
338 
339   /* Get the info we need about the permutations and groupings. */
340   iperm   = aijperm->iperm;
341   ngroup  = aijperm->ngroup;
342   xgroup  = aijperm->xgroup;
343   nzgroup = aijperm->nzgroup;
344 
345 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
346   fortranmultaijperm_(&m,x,ii,aj,aa,y);
347 #else
348 
349   for (igroup=0; igroup<ngroup; igroup++) {
350     jstart = xgroup[igroup];
351     jend   = xgroup[igroup+1] - 1;
352     nz     = nzgroup[igroup];
353 
354     /* Handle the special cases where the number of nonzeros per row
355      * in the group is either 0 or 1. */
356     if (nz == 0) {
357       for (i=jstart; i<=jend; i++) {
358         y[iperm[i]] = 0.0;
359       }
360     } else if (nz == 1) {
361       for (i=jstart; i<=jend; i++) {
362         iold    = iperm[i];
363         ipos    = ai[iold];
364         y[iold] = aa[ipos] * x[aj[ipos]];
365       }
366     } else {
367 
368       /* We work our way through the current group in chunks of NDIM rows
369        * at a time. */
370 
371       for (istart=jstart; istart<=jend; istart+=NDIM) {
372         /* Figure out where the chunk of 'isize' rows ends in iperm.
373          * 'isize may of course be less than NDIM for the last chunk. */
374         iend = istart + (NDIM - 1);
375 
376         if (iend > jend) iend = jend;
377 
378         isize = iend - istart + 1;
379 
380         /* Initialize the yp[] array that will be used to hold part of
381          * the permuted results vector, and figure out where in aa each
382          * row of the chunk will begin. */
383         for (i=0; i<isize; i++) {
384           iold = iperm[istart + i];
385           /* iold is a row number from the matrix A *before* reordering. */
386           ip[i] = ai[iold];
387           /* ip[i] tells us where the ith row of the chunk begins in aa. */
388           yp[i] = (PetscScalar) 0.0;
389         }
390 
391         /* If the number of zeros per row exceeds the number of rows in
392          * the chunk, we should vectorize along nz, that is, perform the
393          * mat-vec one row at a time as in the usual CSR case. */
394         if (nz > isize) {
395 #if defined(PETSC_HAVE_CRAY_VECTOR)
396 #pragma _CRI preferstream
397 #endif
398           for (i=0; i<isize; i++) {
399 #if defined(PETSC_HAVE_CRAY_VECTOR)
400 #pragma _CRI prefervector
401 #endif
402 
403 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
404             vec_y = _mm512_setzero_pd();
405             ipos = ip[i];
406             for (j=0; j<(nz>>3); j++) {
407               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
408               vec_vals = _mm512_loadu_pd(&aa[ipos]);
409               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
410               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
411               ipos += 8;
412             }
413             if ((nz&0x07)>2) {
414               mask     = (__mmask8)(0xff >> (8-(nz&0x07)));
415               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
416               vec_vals = _mm512_loadu_pd(&aa[ipos]);
417               vec_x    = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
418               vec_y    = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
419             } else if ((nz&0x07)==2) {
420               yp[i] += aa[ipos]*x[aj[ipos]];
421               yp[i] += aa[ipos+1]*x[aj[ipos+1]];
422             } else if ((nz&0x07)==1) {
423               yp[i] += aa[ipos]*x[aj[ipos]];
424             }
425             yp[i] += _mm512_reduce_add_pd(vec_y);
426 #else
427             for (j=0; j<nz; j++) {
428               ipos   = ip[i] + j;
429               yp[i] += aa[ipos] * x[aj[ipos]];
430             }
431 #endif
432           }
433         } else {
434           /* Otherwise, there are enough rows in the chunk to make it
435            * worthwhile to vectorize across the rows, that is, to do the
436            * matvec by operating with "columns" of the chunk. */
437           for (j=0; j<nz; j++) {
438 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
439             vec_j = _mm256_set1_epi32(j);
440             for (i=0; i<((isize>>3)<<3); i+=8) {
441               vec_y    = _mm512_loadu_pd(&yp[i]);
442               vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
443               vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
444               vec_idx  = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
445               vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
446               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
447               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
448               _mm512_storeu_pd(&yp[i],vec_y);
449             }
450             for (i=isize-(isize&0x07); i<isize; i++) {
451               ipos = ip[i]+j;
452               yp[i] += aa[ipos]*x[aj[ipos]];
453             }
454 #else
455             for (i=0; i<isize; i++) {
456               ipos   = ip[i] + j;
457               yp[i] += aa[ipos] * x[aj[ipos]];
458             }
459 #endif
460           }
461         }
462 
463 #if defined(PETSC_HAVE_CRAY_VECTOR)
464 #pragma _CRI ivdep
465 #endif
466         /* Put results from yp[] into non-permuted result vector y. */
467         for (i=0; i<isize; i++) {
468           y[iperm[istart+i]] = yp[i];
469         }
470       } /* End processing chunk of isize rows of a group. */
471     } /* End handling matvec for chunk with nz > 1. */
472   } /* End loop over igroup. */
473 #endif
474   ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr);
475   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
476   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
477   PetscFunctionReturn(0);
478 }
479 
480 
481 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
482  * Note that the names I used to designate the vectors differs from that
483  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
484  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
485 /*
486     I hate having virtually identical code for the mult and the multadd!!!
487 */
488 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
489 {
490   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
491   const PetscScalar *x;
492   PetscScalar       *y,*w;
493   const MatScalar   *aa;
494   PetscErrorCode    ierr;
495   const PetscInt    *aj,*ai;
496 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
497   PetscInt i,j;
498 #endif
499 
500   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
501   Mat_SeqAIJPERM * aijperm;
502   PetscInt       *iperm;    /* Points to the permutation vector. */
503   PetscInt       *xgroup;
504   /* Denotes where groups of rows with same number of nonzeros
505    * begin and end in iperm. */
506   PetscInt *nzgroup;
507   PetscInt ngroup;
508   PetscInt igroup;
509   PetscInt jstart,jend;
510   /* jstart is used in loops to denote the position in iperm where a
511    * group starts; jend denotes the position where it ends.
512    * (jend + 1 is where the next group starts.) */
513   PetscInt    iold,nz;
514   PetscInt    istart,iend,isize;
515   PetscInt    ipos;
516   PetscScalar yp[NDIM];
517   PetscInt    ip[NDIM];
518   /* yp[] and ip[] are treated as vector "registers" for performing
519    * the mat-vec. */
520 
521 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
522 #pragma disjoint(*x,*y,*aa)
523 #endif
524 
525   PetscFunctionBegin;
526   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
527   ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
528 
529   aj = a->j;   /* aj[k] gives column index for element aa[k]. */
530   aa = a->a;   /* Nonzero elements stored row-by-row. */
531   ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */
532 
533   /* Get the info we need about the permutations and groupings. */
534   aijperm = (Mat_SeqAIJPERM*) A->spptr;
535   iperm   = aijperm->iperm;
536   ngroup  = aijperm->ngroup;
537   xgroup  = aijperm->xgroup;
538   nzgroup = aijperm->nzgroup;
539 
540 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
541   fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
542 #else
543 
544   for (igroup=0; igroup<ngroup; igroup++) {
545     jstart = xgroup[igroup];
546     jend   = xgroup[igroup+1] - 1;
547 
548     nz = nzgroup[igroup];
549 
550     /* Handle the special cases where the number of nonzeros per row
551      * in the group is either 0 or 1. */
552     if (nz == 0) {
553       for (i=jstart; i<=jend; i++) {
554         iold    = iperm[i];
555         y[iold] = w[iold];
556       }
557     }
558     else if (nz == 1) {
559       for (i=jstart; i<=jend; i++) {
560         iold    = iperm[i];
561         ipos    = ai[iold];
562         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
563       }
564     }
565     /* For the general case: */
566     else {
567 
568       /* We work our way through the current group in chunks of NDIM rows
569        * at a time. */
570 
571       for (istart=jstart; istart<=jend; istart+=NDIM) {
572         /* Figure out where the chunk of 'isize' rows ends in iperm.
573          * 'isize may of course be less than NDIM for the last chunk. */
574         iend = istart + (NDIM - 1);
575         if (iend > jend) iend = jend;
576         isize = iend - istart + 1;
577 
578         /* Initialize the yp[] array that will be used to hold part of
579          * the permuted results vector, and figure out where in aa each
580          * row of the chunk will begin. */
581         for (i=0; i<isize; i++) {
582           iold = iperm[istart + i];
583           /* iold is a row number from the matrix A *before* reordering. */
584           ip[i] = ai[iold];
585           /* ip[i] tells us where the ith row of the chunk begins in aa. */
586           yp[i] = w[iold];
587         }
588 
589         /* If the number of zeros per row exceeds the number of rows in
590          * the chunk, we should vectorize along nz, that is, perform the
591          * mat-vec one row at a time as in the usual CSR case. */
592         if (nz > isize) {
593 #if defined(PETSC_HAVE_CRAY_VECTOR)
594 #pragma _CRI preferstream
595 #endif
596           for (i=0; i<isize; i++) {
597 #if defined(PETSC_HAVE_CRAY_VECTOR)
598 #pragma _CRI prefervector
599 #endif
600             for (j=0; j<nz; j++) {
601               ipos   = ip[i] + j;
602               yp[i] += aa[ipos] * x[aj[ipos]];
603             }
604           }
605         }
606         /* Otherwise, there are enough rows in the chunk to make it
607          * worthwhile to vectorize across the rows, that is, to do the
608          * matvec by operating with "columns" of the chunk. */
609         else {
610           for (j=0; j<nz; j++) {
611             for (i=0; i<isize; i++) {
612               ipos   = ip[i] + j;
613               yp[i] += aa[ipos] * x[aj[ipos]];
614             }
615           }
616         }
617 
618 #if defined(PETSC_HAVE_CRAY_VECTOR)
619 #pragma _CRI ivdep
620 #endif
621         /* Put results from yp[] into non-permuted result vector y. */
622         for (i=0; i<isize; i++) {
623           y[iperm[istart+i]] = yp[i];
624         }
625       } /* End processing chunk of isize rows of a group. */
626 
627     } /* End handling matvec for chunk with nz > 1. */
628   } /* End loop over igroup. */
629 
630 #endif
631   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
632   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
633   ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
634   PetscFunctionReturn(0);
635 }
636 
637 
638 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
639  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
640  * routine, but can also be used to convert an assembled SeqAIJ matrix
641  * into a SeqAIJPERM one. */
642 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
643 {
644   PetscErrorCode ierr;
645   Mat            B = *newmat;
646   Mat_SeqAIJPERM *aijperm;
647   PetscBool      sametype;
648 
649   PetscFunctionBegin;
650   if (reuse == MAT_INITIAL_MATRIX) {
651     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
652   }
653   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
654   if (sametype) PetscFunctionReturn(0);
655 
656   ierr     = PetscNewLog(B,&aijperm);CHKERRQ(ierr);
657   B->spptr = (void*) aijperm;
658 
659   /* Set function pointers for methods that we inherit from AIJ but override. */
660   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
661   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
662   B->ops->destroy     = MatDestroy_SeqAIJPERM;
663   B->ops->mult        = MatMult_SeqAIJPERM;
664   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
665 
666   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
667   /* If A has already been assembled, compute the permutation. */
668   if (A->assembled) {
669     ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr);
670   }
671 
672   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
673   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
674   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
675   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
676 
677   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr);
678   *newmat = B;
679   PetscFunctionReturn(0);
680 }
681 
682 /*@C
683    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
684    This type inherits from AIJ, but calculates some additional permutation
685    information that is used to allow better vectorization of some
686    operations.  At the cost of increased storage, the AIJ formatted
687    matrix can be copied to a format in which pieces of the matrix are
688    stored in ELLPACK format, allowing the vectorized matrix multiply
689    routine to use stride-1 memory accesses.  As with the AIJ type, it is
690    important to preallocate matrix storage in order to get good assembly
691    performance.
692 
693    Collective on MPI_Comm
694 
695    Input Parameters:
696 +  comm - MPI communicator, set to PETSC_COMM_SELF
697 .  m - number of rows
698 .  n - number of columns
699 .  nz - number of nonzeros per row (same for all rows)
700 -  nnz - array containing the number of nonzeros in the various rows
701          (possibly different for each row) or NULL
702 
703    Output Parameter:
704 .  A - the matrix
705 
706    Notes:
707    If nnz is given then nz is ignored
708 
709    Level: intermediate
710 
711 .keywords: matrix, cray, sparse, parallel
712 
713 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
714 @*/
715 PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
716 {
717   PetscErrorCode ierr;
718 
719   PetscFunctionBegin;
720   ierr = MatCreate(comm,A);CHKERRQ(ierr);
721   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
722   ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr);
723   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
724   PetscFunctionReturn(0);
725 }
726 
727 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
728 {
729   PetscErrorCode ierr;
730 
731   PetscFunctionBegin;
732   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
733   ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
734   PetscFunctionReturn(0);
735 }
736 
737