xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision c87ba875e4007ad659b117ea274f03d5f4cd5ea7)
1 
2 /*
3   Defines basic operations for the MATSEQAIJPERM matrix class.
4   This class is derived from the MATSEQAIJ class and retains the
5   compressed row storage (aka Yale sparse matrix format) but augments
6   it with some permutation information that enables some operations
7   to be more vectorizable.  A physically rearranged copy of the matrix
8   may be stored if the user desires.
9 
10   Eventually a variety of permutations may be supported.
11 */
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16 #include <immintrin.h>
17 
18 #if !defined(_MM_SCALE_8)
19 #define _MM_SCALE_8    8
20 #endif
21 #if !defined(_MM_SCALE_4)
22 #define _MM_SCALE_4    4
23 #endif
24 #endif
25 
26 #define NDIM 512
27 /* NDIM specifies how many rows at a time we should work with when
28  * performing the vectorized mat-vec.  This depends on various factors
29  * such as vector register length, etc., and I really need to add a
30  * way for the user (or the library) to tune this.  I'm setting it to
31  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32  * routines. */
33 
34 typedef struct {
35   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36 
37   PetscInt         ngroup;
38   PetscInt         *xgroup;
39   /* Denotes where groups of rows with same number of nonzeros
40    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41    * where the ith group begins. */
42 
43   PetscInt         *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
44   PetscInt         *iperm;  /* The permutation vector. */
45 
46   /* Some of this stuff is for Ed's recursive triangular solve.
47    * I'm not sure what I need yet. */
48   PetscInt         blocksize;
49   PetscInt         nstep;
50   PetscInt         *jstart_list;
51   PetscInt         *jend_list;
52   PetscInt         *action_list;
53   PetscInt         *ngroup_list;
54   PetscInt         **ipointer_list;
55   PetscInt         **xgroup_list;
56   PetscInt         **nzgroup_list;
57   PetscInt         **iperm_list;
58 } Mat_SeqAIJPERM;
59 
60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61 {
62   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63   /* so we will ignore 'MatType type'. */
64   PetscErrorCode ierr;
65   Mat            B       = *newmat;
66   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
67 
68   PetscFunctionBegin;
69   if (reuse == MAT_INITIAL_MATRIX) {
70     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
71     aijperm=(Mat_SeqAIJPERM*)B->spptr;
72   }
73 
74   /* Reset the original function pointers. */
75   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
76   B->ops->destroy     = MatDestroy_SeqAIJ;
77   B->ops->duplicate   = MatDuplicate_SeqAIJ;
78   B->ops->mult        = MatMult_SeqAIJ;
79   B->ops->multadd     = MatMultAdd_SeqAIJ;
80 
81   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr);
82   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
83   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
84 
85   /* Free everything in the Mat_SeqAIJPERM data structure.*/
86   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
87   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
88   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
89   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
90 
91   /* Change the type of B to MATSEQAIJ. */
92   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
93 
94   *newmat = B;
95   PetscFunctionReturn(0);
96 }
97 
98 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
99 {
100   PetscErrorCode ierr;
101   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
102 
103   PetscFunctionBegin;
104   if (aijperm) {
105     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
106     ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
107     ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
108     ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
109     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
110   }
111   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
112    * to destroy everything that remains. */
113   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
114   /* Note that I don't call MatSetType().  I believe this is because that
115    * is only to be called when *building* a matrix.  I could be wrong, but
116    * that is how things work for the SuperLU matrix class. */
117   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
118   PetscFunctionReturn(0);
119 }
120 
121 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
122 {
123   PetscErrorCode ierr;
124   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
125   Mat_SeqAIJPERM *aijperm_dest;
126   PetscBool      perm;
127 
128   PetscFunctionBegin;
129   ierr        = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
130   ierr        = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr);
131   if (perm) {
132     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
133     ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr);
134     ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr);
135     ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr);
136   } else {
137     ierr        = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr);
138     (*M)->spptr = (void*) aijperm_dest;
139     ierr        = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr);
140     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
141     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
142     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
143   }
144   ierr        = PetscArraycpy(aijperm_dest,aijperm,1);CHKERRQ(ierr);
145   /* Allocate space for, and copy the grouping and permutation info.
146    * I note that when the groups are initially determined in
147    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
148    * necessary.  But at this point, we know how large they need to be, and
149    * allocate only the necessary amount of memory.  So the duplicated matrix
150    * may actually use slightly less storage than the original! */
151   ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr);
152   ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr);
153   ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr);
154   ierr = PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);CHKERRQ(ierr);
155   ierr = PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);CHKERRQ(ierr);
156   ierr = PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);CHKERRQ(ierr);
157   PetscFunctionReturn(0);
158 }
159 
160 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
161 {
162   PetscErrorCode ierr;
163   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
164   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
165   PetscInt       m;       /* Number of rows in the matrix. */
166   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
167   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
168   PetscInt       *rows_in_bucket;
169   /* To construct the permutation, we sort each row into one of maxnz
170    * buckets based on how many nonzeros are in the row. */
171   PetscInt       nz;
172   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
173   PetscInt       *ipnz;
174   /* When constructing the iperm permutation vector,
175    * ipnz[nz] is used to point to the next place in the permutation vector
176    * that a row with nz nonzero elements should be placed.*/
177   PetscInt       i, ngroup, istart, ipos;
178 
179   PetscFunctionBegin;
180   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
181   aijperm->nonzerostate = A->nonzerostate;
182  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
183   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
184   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
185   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
186 
187   m  = A->rmap->n;
188   ia = a->i;
189 
190   /* Allocate the arrays that will hold the permutation vector. */
191   ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr);
192 
193   /* Allocate some temporary work arrays that will be used in
194    * calculating the permuation vector and groupings. */
195   ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr);
196 
197   /* Now actually figure out the permutation and grouping. */
198 
199   /* First pass: Determine number of nonzeros in each row, maximum
200    * number of nonzeros in any row, and how many rows fall into each
201    * "bucket" of rows with same number of nonzeros. */
202   maxnz = 0;
203   for (i=0; i<m; i++) {
204     nz_in_row[i] = ia[i+1]-ia[i];
205     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
206   }
207   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr);
208   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr);
209 
210   for (i=0; i<=maxnz; i++) {
211     rows_in_bucket[i] = 0;
212   }
213   for (i=0; i<m; i++) {
214     nz = nz_in_row[i];
215     rows_in_bucket[nz]++;
216   }
217 
218   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
219    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
220    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
221    * then xgroup[] must consist of (maxnz + 2) elements, since the last
222    * element of xgroup will tell us where the (maxnz + 1)th group ends.
223    * We allocate space for the maximum number of groups;
224    * that is potentially a little wasteful, but not too much so.
225    * Perhaps I should fix it later. */
226   ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr);
227   ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr);
228 
229   /* Second pass.  Look at what is in the buckets and create the groupings.
230    * Note that it is OK to have a group of rows with no non-zero values. */
231   ngroup = 0;
232   istart = 0;
233   for (i=0; i<=maxnz; i++) {
234     if (rows_in_bucket[i] > 0) {
235       aijperm->nzgroup[ngroup] = i;
236       aijperm->xgroup[ngroup]  = istart;
237       ngroup++;
238       istart += rows_in_bucket[i];
239     }
240   }
241 
242   aijperm->xgroup[ngroup] = istart;
243   aijperm->ngroup         = ngroup;
244 
245   /* Now fill in the permutation vector iperm. */
246   ipnz[0] = 0;
247   for (i=0; i<maxnz; i++) {
248     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
249   }
250 
251   for (i=0; i<m; i++) {
252     nz                   = nz_in_row[i];
253     ipos                 = ipnz[nz];
254     aijperm->iperm[ipos] = i;
255     ipnz[nz]++;
256   }
257 
258   /* Clean up temporary work arrays. */
259   ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr);
260   ierr = PetscFree(ipnz);CHKERRQ(ierr);
261   ierr = PetscFree(nz_in_row);CHKERRQ(ierr);
262   PetscFunctionReturn(0);
263 }
264 
265 
266 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
267 {
268   PetscErrorCode ierr;
269   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
270 
271   PetscFunctionBegin;
272   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
273 
274   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
275    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
276    * I'm not sure if this is the best way to do this, but it avoids
277    * a lot of code duplication.
278    * I also note that currently MATSEQAIJPERM doesn't know anything about
279    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
280    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
281    * this, this may break things.  (Don't know... haven't looked at it.) */
282   a->inode.use = PETSC_FALSE;
283   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
284 
285   /* Now calculate the permutation and grouping information. */
286   ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr);
287   PetscFunctionReturn(0);
288 }
289 
290 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
291 {
292   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
293   const PetscScalar *x;
294   PetscScalar       *y;
295   const MatScalar   *aa;
296   PetscErrorCode    ierr;
297   const PetscInt    *aj,*ai;
298 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
299   PetscInt          i,j;
300 #endif
301 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
302   __m512d           vec_x,vec_y,vec_vals;
303   __m256i           vec_idx,vec_ipos,vec_j;
304   __mmask8           mask;
305 #endif
306 
307   /* Variables that don't appear in MatMult_SeqAIJ. */
308   Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
309   PetscInt          *iperm;  /* Points to the permutation vector. */
310   PetscInt          *xgroup;
311   /* Denotes where groups of rows with same number of nonzeros
312    * begin and end in iperm. */
313   PetscInt          *nzgroup;
314   PetscInt          ngroup;
315   PetscInt          igroup;
316   PetscInt          jstart,jend;
317   /* jstart is used in loops to denote the position in iperm where a
318    * group starts; jend denotes the position where it ends.
319    * (jend + 1 is where the next group starts.) */
320   PetscInt          iold,nz;
321   PetscInt          istart,iend,isize;
322   PetscInt          ipos;
323   PetscScalar       yp[NDIM];
324   PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
325 
326 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
327 #pragma disjoint(*x,*y,*aa)
328 #endif
329 
330   PetscFunctionBegin;
331   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
332   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
333   aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
334   aa   = a->a; /* Nonzero elements stored row-by-row. */
335   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
336 
337   /* Get the info we need about the permutations and groupings. */
338   iperm   = aijperm->iperm;
339   ngroup  = aijperm->ngroup;
340   xgroup  = aijperm->xgroup;
341   nzgroup = aijperm->nzgroup;
342 
343 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
344   fortranmultaijperm_(&m,x,ii,aj,aa,y);
345 #else
346 
347   for (igroup=0; igroup<ngroup; igroup++) {
348     jstart = xgroup[igroup];
349     jend   = xgroup[igroup+1] - 1;
350     nz     = nzgroup[igroup];
351 
352     /* Handle the special cases where the number of nonzeros per row
353      * in the group is either 0 or 1. */
354     if (nz == 0) {
355       for (i=jstart; i<=jend; i++) {
356         y[iperm[i]] = 0.0;
357       }
358     } else if (nz == 1) {
359       for (i=jstart; i<=jend; i++) {
360         iold    = iperm[i];
361         ipos    = ai[iold];
362         y[iold] = aa[ipos] * x[aj[ipos]];
363       }
364     } else {
365 
366       /* We work our way through the current group in chunks of NDIM rows
367        * at a time. */
368 
369       for (istart=jstart; istart<=jend; istart+=NDIM) {
370         /* Figure out where the chunk of 'isize' rows ends in iperm.
371          * 'isize may of course be less than NDIM for the last chunk. */
372         iend = istart + (NDIM - 1);
373 
374         if (iend > jend) iend = jend;
375 
376         isize = iend - istart + 1;
377 
378         /* Initialize the yp[] array that will be used to hold part of
379          * the permuted results vector, and figure out where in aa each
380          * row of the chunk will begin. */
381         for (i=0; i<isize; i++) {
382           iold = iperm[istart + i];
383           /* iold is a row number from the matrix A *before* reordering. */
384           ip[i] = ai[iold];
385           /* ip[i] tells us where the ith row of the chunk begins in aa. */
386           yp[i] = (PetscScalar) 0.0;
387         }
388 
389         /* If the number of zeros per row exceeds the number of rows in
390          * the chunk, we should vectorize along nz, that is, perform the
391          * mat-vec one row at a time as in the usual CSR case. */
392         if (nz > isize) {
393 #if defined(PETSC_HAVE_CRAY_VECTOR)
394 #pragma _CRI preferstream
395 #endif
396           for (i=0; i<isize; i++) {
397 #if defined(PETSC_HAVE_CRAY_VECTOR)
398 #pragma _CRI prefervector
399 #endif
400 
401 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
402             vec_y = _mm512_setzero_pd();
403             ipos = ip[i];
404             for (j=0; j<(nz>>3); j++) {
405               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
406               vec_vals = _mm512_loadu_pd(&aa[ipos]);
407               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
408               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
409               ipos += 8;
410             }
411             if ((nz&0x07)>2) {
412               mask     = (__mmask8)(0xff >> (8-(nz&0x07)));
413               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
414               vec_vals = _mm512_loadu_pd(&aa[ipos]);
415               vec_x    = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
416               vec_y    = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
417             } else if ((nz&0x07)==2) {
418               yp[i] += aa[ipos]*x[aj[ipos]];
419               yp[i] += aa[ipos+1]*x[aj[ipos+1]];
420             } else if ((nz&0x07)==1) {
421               yp[i] += aa[ipos]*x[aj[ipos]];
422             }
423             yp[i] += _mm512_reduce_add_pd(vec_y);
424 #else
425             for (j=0; j<nz; j++) {
426               ipos   = ip[i] + j;
427               yp[i] += aa[ipos] * x[aj[ipos]];
428             }
429 #endif
430           }
431         } else {
432           /* Otherwise, there are enough rows in the chunk to make it
433            * worthwhile to vectorize across the rows, that is, to do the
434            * matvec by operating with "columns" of the chunk. */
435           for (j=0; j<nz; j++) {
436 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
437             vec_j = _mm256_set1_epi32(j);
438             for (i=0; i<((isize>>3)<<3); i+=8) {
439               vec_y    = _mm512_loadu_pd(&yp[i]);
440               vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
441               vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
442               vec_idx  = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
443               vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
444               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
445               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
446               _mm512_storeu_pd(&yp[i],vec_y);
447             }
448             for (i=isize-(isize&0x07); i<isize; i++) {
449               ipos = ip[i]+j;
450               yp[i] += aa[ipos]*x[aj[ipos]];
451             }
452 #else
453             for (i=0; i<isize; i++) {
454               ipos   = ip[i] + j;
455               yp[i] += aa[ipos] * x[aj[ipos]];
456             }
457 #endif
458           }
459         }
460 
461 #if defined(PETSC_HAVE_CRAY_VECTOR)
462 #pragma _CRI ivdep
463 #endif
464         /* Put results from yp[] into non-permuted result vector y. */
465         for (i=0; i<isize; i++) {
466           y[iperm[istart+i]] = yp[i];
467         }
468       } /* End processing chunk of isize rows of a group. */
469     } /* End handling matvec for chunk with nz > 1. */
470   } /* End loop over igroup. */
471 #endif
472   ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr);
473   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
474   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
475   PetscFunctionReturn(0);
476 }
477 
478 
479 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
480  * Note that the names I used to designate the vectors differs from that
481  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
482  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
483 /*
484     I hate having virtually identical code for the mult and the multadd!!!
485 */
486 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
487 {
488   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
489   const PetscScalar *x;
490   PetscScalar       *y,*w;
491   const MatScalar   *aa;
492   PetscErrorCode    ierr;
493   const PetscInt    *aj,*ai;
494 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
495   PetscInt i,j;
496 #endif
497 
498   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
499   Mat_SeqAIJPERM * aijperm;
500   PetscInt       *iperm;    /* Points to the permutation vector. */
501   PetscInt       *xgroup;
502   /* Denotes where groups of rows with same number of nonzeros
503    * begin and end in iperm. */
504   PetscInt *nzgroup;
505   PetscInt ngroup;
506   PetscInt igroup;
507   PetscInt jstart,jend;
508   /* jstart is used in loops to denote the position in iperm where a
509    * group starts; jend denotes the position where it ends.
510    * (jend + 1 is where the next group starts.) */
511   PetscInt    iold,nz;
512   PetscInt    istart,iend,isize;
513   PetscInt    ipos;
514   PetscScalar yp[NDIM];
515   PetscInt    ip[NDIM];
516   /* yp[] and ip[] are treated as vector "registers" for performing
517    * the mat-vec. */
518 
519 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
520 #pragma disjoint(*x,*y,*aa)
521 #endif
522 
523   PetscFunctionBegin;
524   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
525   ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
526 
527   aj = a->j;   /* aj[k] gives column index for element aa[k]. */
528   aa = a->a;   /* Nonzero elements stored row-by-row. */
529   ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */
530 
531   /* Get the info we need about the permutations and groupings. */
532   aijperm = (Mat_SeqAIJPERM*) A->spptr;
533   iperm   = aijperm->iperm;
534   ngroup  = aijperm->ngroup;
535   xgroup  = aijperm->xgroup;
536   nzgroup = aijperm->nzgroup;
537 
538 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
539   fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
540 #else
541 
542   for (igroup=0; igroup<ngroup; igroup++) {
543     jstart = xgroup[igroup];
544     jend   = xgroup[igroup+1] - 1;
545 
546     nz = nzgroup[igroup];
547 
548     /* Handle the special cases where the number of nonzeros per row
549      * in the group is either 0 or 1. */
550     if (nz == 0) {
551       for (i=jstart; i<=jend; i++) {
552         iold    = iperm[i];
553         y[iold] = w[iold];
554       }
555     }
556     else if (nz == 1) {
557       for (i=jstart; i<=jend; i++) {
558         iold    = iperm[i];
559         ipos    = ai[iold];
560         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
561       }
562     }
563     /* For the general case: */
564     else {
565 
566       /* We work our way through the current group in chunks of NDIM rows
567        * at a time. */
568 
569       for (istart=jstart; istart<=jend; istart+=NDIM) {
570         /* Figure out where the chunk of 'isize' rows ends in iperm.
571          * 'isize may of course be less than NDIM for the last chunk. */
572         iend = istart + (NDIM - 1);
573         if (iend > jend) iend = jend;
574         isize = iend - istart + 1;
575 
576         /* Initialize the yp[] array that will be used to hold part of
577          * the permuted results vector, and figure out where in aa each
578          * row of the chunk will begin. */
579         for (i=0; i<isize; i++) {
580           iold = iperm[istart + i];
581           /* iold is a row number from the matrix A *before* reordering. */
582           ip[i] = ai[iold];
583           /* ip[i] tells us where the ith row of the chunk begins in aa. */
584           yp[i] = w[iold];
585         }
586 
587         /* If the number of zeros per row exceeds the number of rows in
588          * the chunk, we should vectorize along nz, that is, perform the
589          * mat-vec one row at a time as in the usual CSR case. */
590         if (nz > isize) {
591 #if defined(PETSC_HAVE_CRAY_VECTOR)
592 #pragma _CRI preferstream
593 #endif
594           for (i=0; i<isize; i++) {
595 #if defined(PETSC_HAVE_CRAY_VECTOR)
596 #pragma _CRI prefervector
597 #endif
598             for (j=0; j<nz; j++) {
599               ipos   = ip[i] + j;
600               yp[i] += aa[ipos] * x[aj[ipos]];
601             }
602           }
603         }
604         /* Otherwise, there are enough rows in the chunk to make it
605          * worthwhile to vectorize across the rows, that is, to do the
606          * matvec by operating with "columns" of the chunk. */
607         else {
608           for (j=0; j<nz; j++) {
609             for (i=0; i<isize; i++) {
610               ipos   = ip[i] + j;
611               yp[i] += aa[ipos] * x[aj[ipos]];
612             }
613           }
614         }
615 
616 #if defined(PETSC_HAVE_CRAY_VECTOR)
617 #pragma _CRI ivdep
618 #endif
619         /* Put results from yp[] into non-permuted result vector y. */
620         for (i=0; i<isize; i++) {
621           y[iperm[istart+i]] = yp[i];
622         }
623       } /* End processing chunk of isize rows of a group. */
624 
625     } /* End handling matvec for chunk with nz > 1. */
626   } /* End loop over igroup. */
627 
628 #endif
629   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
630   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
631   ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
632   PetscFunctionReturn(0);
633 }
634 
635 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
636  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
637  * routine, but can also be used to convert an assembled SeqAIJ matrix
638  * into a SeqAIJPERM one. */
639 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
640 {
641   PetscErrorCode ierr;
642   Mat            B = *newmat;
643   Mat_SeqAIJPERM *aijperm;
644   PetscBool      sametype;
645 
646   PetscFunctionBegin;
647   if (reuse == MAT_INITIAL_MATRIX) {
648     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
649   }
650   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
651   if (sametype) PetscFunctionReturn(0);
652 
653   ierr     = PetscNewLog(B,&aijperm);CHKERRQ(ierr);
654   B->spptr = (void*) aijperm;
655 
656   /* Set function pointers for methods that we inherit from AIJ but override. */
657   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
658   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
659   B->ops->destroy     = MatDestroy_SeqAIJPERM;
660   B->ops->mult        = MatMult_SeqAIJPERM;
661   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
662 
663   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
664   /* If A has already been assembled, compute the permutation. */
665   if (A->assembled) {
666     ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr);
667   }
668 
669   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
670   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
671   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
672 
673   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr);
674   *newmat = B;
675   PetscFunctionReturn(0);
676 }
677 
678 /*@C
679    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
680    This type inherits from AIJ, but calculates some additional permutation
681    information that is used to allow better vectorization of some
682    operations.  At the cost of increased storage, the AIJ formatted
683    matrix can be copied to a format in which pieces of the matrix are
684    stored in ELLPACK format, allowing the vectorized matrix multiply
685    routine to use stride-1 memory accesses.  As with the AIJ type, it is
686    important to preallocate matrix storage in order to get good assembly
687    performance.
688 
689    Collective
690 
691    Input Parameters:
692 +  comm - MPI communicator, set to PETSC_COMM_SELF
693 .  m - number of rows
694 .  n - number of columns
695 .  nz - number of nonzeros per row (same for all rows)
696 -  nnz - array containing the number of nonzeros in the various rows
697          (possibly different for each row) or NULL
698 
699    Output Parameter:
700 .  A - the matrix
701 
702    Notes:
703    If nnz is given then nz is ignored
704 
705    Level: intermediate
706 
707 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
708 @*/
709 PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
710 {
711   PetscErrorCode ierr;
712 
713   PetscFunctionBegin;
714   ierr = MatCreate(comm,A);CHKERRQ(ierr);
715   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
716   ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr);
717   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
718   PetscFunctionReturn(0);
719 }
720 
721 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
722 {
723   PetscErrorCode ierr;
724 
725   PetscFunctionBegin;
726   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
727   ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
728   PetscFunctionReturn(0);
729 }
730 
731