xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision ccb4e88a40f0b86eaeca07ff64c64e4de2fae686)
1 
2 /*
3   Defines basic operations for the MATSEQAIJPERM matrix class.
4   This class is derived from the MATSEQAIJ class and retains the
5   compressed row storage (aka Yale sparse matrix format) but augments
6   it with some permutation information that enables some operations
7   to be more vectorizable.  A physically rearranged copy of the matrix
8   may be stored if the user desires.
9 
10   Eventually a variety of permutations may be supported.
11 */
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16 #include <immintrin.h>
17 
18 #if !defined(_MM_SCALE_8)
19 #define _MM_SCALE_8    8
20 #endif
21 #if !defined(_MM_SCALE_4)
22 #define _MM_SCALE_4    4
23 #endif
24 #endif
25 
26 #define NDIM 512
27 /* NDIM specifies how many rows at a time we should work with when
28  * performing the vectorized mat-vec.  This depends on various factors
29  * such as vector register length, etc., and I really need to add a
30  * way for the user (or the library) to tune this.  I'm setting it to
31  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32  * routines. */
33 
34 typedef struct {
35   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36 
37   PetscInt         ngroup;
38   PetscInt         *xgroup;
39   /* Denotes where groups of rows with same number of nonzeros
40    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41    * where the ith group begins. */
42 
43   PetscInt         *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
44   PetscInt         *iperm;  /* The permutation vector. */
45 
46   /* Some of this stuff is for Ed's recursive triangular solve.
47    * I'm not sure what I need yet. */
48   PetscInt         blocksize;
49   PetscInt         nstep;
50   PetscInt         *jstart_list;
51   PetscInt         *jend_list;
52   PetscInt         *action_list;
53   PetscInt         *ngroup_list;
54   PetscInt         **ipointer_list;
55   PetscInt         **xgroup_list;
56   PetscInt         **nzgroup_list;
57   PetscInt         **iperm_list;
58 } Mat_SeqAIJPERM;
59 
60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61 {
62   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63   /* so we will ignore 'MatType type'. */
64   PetscErrorCode ierr;
65   Mat            B       = *newmat;
66   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
67 
68   PetscFunctionBegin;
69   if (reuse == MAT_INITIAL_MATRIX) {
70     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
71     aijperm=(Mat_SeqAIJPERM*)B->spptr;
72   }
73 
74   /* Reset the original function pointers. */
75   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
76   B->ops->destroy     = MatDestroy_SeqAIJ;
77   B->ops->duplicate   = MatDuplicate_SeqAIJ;
78   B->ops->mult        = MatMult_SeqAIJ;
79   B->ops->multadd     = MatMultAdd_SeqAIJ;
80 
81   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr);
82 
83   /* Free everything in the Mat_SeqAIJPERM data structure.*/
84   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
85   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
86   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
87   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
88 
89   /* Change the type of B to MATSEQAIJ. */
90   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
91 
92   *newmat = B;
93   PetscFunctionReturn(0);
94 }
95 
96 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
97 {
98   PetscErrorCode ierr;
99   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
100 
101   PetscFunctionBegin;
102   if (aijperm) {
103     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
104     ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
105     ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
106     ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
107     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
108   }
109   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
110    * to destroy everything that remains. */
111   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
112   /* Note that I don't call MatSetType().  I believe this is because that
113    * is only to be called when *building* a matrix.  I could be wrong, but
114    * that is how things work for the SuperLU matrix class. */
115   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
116   PetscFunctionReturn(0);
117 }
118 
119 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
120 {
121   PetscErrorCode ierr;
122   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
123   Mat_SeqAIJPERM *aijperm_dest;
124   PetscBool      perm;
125 
126   PetscFunctionBegin;
127   ierr        = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
128   ierr        = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr);
129   if (perm) {
130     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
131     ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr);
132     ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr);
133     ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr);
134   } else {
135     ierr        = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr);
136     (*M)->spptr = (void*) aijperm_dest;
137     ierr        = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr);
138     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
139   }
140   ierr        = PetscArraycpy(aijperm_dest,aijperm,1);CHKERRQ(ierr);
141   /* Allocate space for, and copy the grouping and permutation info.
142    * I note that when the groups are initially determined in
143    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
144    * necessary.  But at this point, we know how large they need to be, and
145    * allocate only the necessary amount of memory.  So the duplicated matrix
146    * may actually use slightly less storage than the original! */
147   ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr);
148   ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr);
149   ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr);
150   ierr = PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);CHKERRQ(ierr);
151   ierr = PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);CHKERRQ(ierr);
152   ierr = PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);CHKERRQ(ierr);
153   PetscFunctionReturn(0);
154 }
155 
156 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
157 {
158   PetscErrorCode ierr;
159   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
160   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
161   PetscInt       m;       /* Number of rows in the matrix. */
162   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
163   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
164   PetscInt       *rows_in_bucket;
165   /* To construct the permutation, we sort each row into one of maxnz
166    * buckets based on how many nonzeros are in the row. */
167   PetscInt       nz;
168   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
169   PetscInt       *ipnz;
170   /* When constructing the iperm permutation vector,
171    * ipnz[nz] is used to point to the next place in the permutation vector
172    * that a row with nz nonzero elements should be placed.*/
173   PetscInt       i, ngroup, istart, ipos;
174 
175   PetscFunctionBegin;
176   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
177   aijperm->nonzerostate = A->nonzerostate;
178  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
179   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
180   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
181   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
182 
183   m  = A->rmap->n;
184   ia = a->i;
185 
186   /* Allocate the arrays that will hold the permutation vector. */
187   ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr);
188 
189   /* Allocate some temporary work arrays that will be used in
190    * calculating the permuation vector and groupings. */
191   ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr);
192 
193   /* Now actually figure out the permutation and grouping. */
194 
195   /* First pass: Determine number of nonzeros in each row, maximum
196    * number of nonzeros in any row, and how many rows fall into each
197    * "bucket" of rows with same number of nonzeros. */
198   maxnz = 0;
199   for (i=0; i<m; i++) {
200     nz_in_row[i] = ia[i+1]-ia[i];
201     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
202   }
203   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr);
204   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr);
205 
206   for (i=0; i<=maxnz; i++) {
207     rows_in_bucket[i] = 0;
208   }
209   for (i=0; i<m; i++) {
210     nz = nz_in_row[i];
211     rows_in_bucket[nz]++;
212   }
213 
214   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
215    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
216    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
217    * then xgroup[] must consist of (maxnz + 2) elements, since the last
218    * element of xgroup will tell us where the (maxnz + 1)th group ends.
219    * We allocate space for the maximum number of groups;
220    * that is potentially a little wasteful, but not too much so.
221    * Perhaps I should fix it later. */
222   ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr);
223   ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr);
224 
225   /* Second pass.  Look at what is in the buckets and create the groupings.
226    * Note that it is OK to have a group of rows with no non-zero values. */
227   ngroup = 0;
228   istart = 0;
229   for (i=0; i<=maxnz; i++) {
230     if (rows_in_bucket[i] > 0) {
231       aijperm->nzgroup[ngroup] = i;
232       aijperm->xgroup[ngroup]  = istart;
233       ngroup++;
234       istart += rows_in_bucket[i];
235     }
236   }
237 
238   aijperm->xgroup[ngroup] = istart;
239   aijperm->ngroup         = ngroup;
240 
241   /* Now fill in the permutation vector iperm. */
242   ipnz[0] = 0;
243   for (i=0; i<maxnz; i++) {
244     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
245   }
246 
247   for (i=0; i<m; i++) {
248     nz                   = nz_in_row[i];
249     ipos                 = ipnz[nz];
250     aijperm->iperm[ipos] = i;
251     ipnz[nz]++;
252   }
253 
254   /* Clean up temporary work arrays. */
255   ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr);
256   ierr = PetscFree(ipnz);CHKERRQ(ierr);
257   ierr = PetscFree(nz_in_row);CHKERRQ(ierr);
258   PetscFunctionReturn(0);
259 }
260 
261 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
262 {
263   PetscErrorCode ierr;
264   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
265 
266   PetscFunctionBegin;
267   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
268 
269   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
270    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
271    * I'm not sure if this is the best way to do this, but it avoids
272    * a lot of code duplication.
273    * I also note that currently MATSEQAIJPERM doesn't know anything about
274    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
275    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
276    * this, this may break things.  (Don't know... haven't looked at it.) */
277   a->inode.use = PETSC_FALSE;
278   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
279 
280   /* Now calculate the permutation and grouping information. */
281   ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr);
282   PetscFunctionReturn(0);
283 }
284 
285 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
286 {
287   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
288   const PetscScalar *x;
289   PetscScalar       *y;
290   const MatScalar   *aa;
291   PetscErrorCode    ierr;
292   const PetscInt    *aj,*ai;
293 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
294   PetscInt          i,j;
295 #endif
296 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
297   __m512d           vec_x,vec_y,vec_vals;
298   __m256i           vec_idx,vec_ipos,vec_j;
299   __mmask8           mask;
300 #endif
301 
302   /* Variables that don't appear in MatMult_SeqAIJ. */
303   Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
304   PetscInt          *iperm;  /* Points to the permutation vector. */
305   PetscInt          *xgroup;
306   /* Denotes where groups of rows with same number of nonzeros
307    * begin and end in iperm. */
308   PetscInt          *nzgroup;
309   PetscInt          ngroup;
310   PetscInt          igroup;
311   PetscInt          jstart,jend;
312   /* jstart is used in loops to denote the position in iperm where a
313    * group starts; jend denotes the position where it ends.
314    * (jend + 1 is where the next group starts.) */
315   PetscInt          iold,nz;
316   PetscInt          istart,iend,isize;
317   PetscInt          ipos;
318   PetscScalar       yp[NDIM];
319   PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
320 
321 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
322 #pragma disjoint(*x,*y,*aa)
323 #endif
324 
325   PetscFunctionBegin;
326   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
327   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
328   aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
329   aa   = a->a; /* Nonzero elements stored row-by-row. */
330   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
331 
332   /* Get the info we need about the permutations and groupings. */
333   iperm   = aijperm->iperm;
334   ngroup  = aijperm->ngroup;
335   xgroup  = aijperm->xgroup;
336   nzgroup = aijperm->nzgroup;
337 
338 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
339   fortranmultaijperm_(&m,x,ii,aj,aa,y);
340 #else
341 
342   for (igroup=0; igroup<ngroup; igroup++) {
343     jstart = xgroup[igroup];
344     jend   = xgroup[igroup+1] - 1;
345     nz     = nzgroup[igroup];
346 
347     /* Handle the special cases where the number of nonzeros per row
348      * in the group is either 0 or 1. */
349     if (nz == 0) {
350       for (i=jstart; i<=jend; i++) {
351         y[iperm[i]] = 0.0;
352       }
353     } else if (nz == 1) {
354       for (i=jstart; i<=jend; i++) {
355         iold    = iperm[i];
356         ipos    = ai[iold];
357         y[iold] = aa[ipos] * x[aj[ipos]];
358       }
359     } else {
360 
361       /* We work our way through the current group in chunks of NDIM rows
362        * at a time. */
363 
364       for (istart=jstart; istart<=jend; istart+=NDIM) {
365         /* Figure out where the chunk of 'isize' rows ends in iperm.
366          * 'isize may of course be less than NDIM for the last chunk. */
367         iend = istart + (NDIM - 1);
368 
369         if (iend > jend) iend = jend;
370 
371         isize = iend - istart + 1;
372 
373         /* Initialize the yp[] array that will be used to hold part of
374          * the permuted results vector, and figure out where in aa each
375          * row of the chunk will begin. */
376         for (i=0; i<isize; i++) {
377           iold = iperm[istart + i];
378           /* iold is a row number from the matrix A *before* reordering. */
379           ip[i] = ai[iold];
380           /* ip[i] tells us where the ith row of the chunk begins in aa. */
381           yp[i] = (PetscScalar) 0.0;
382         }
383 
384         /* If the number of zeros per row exceeds the number of rows in
385          * the chunk, we should vectorize along nz, that is, perform the
386          * mat-vec one row at a time as in the usual CSR case. */
387         if (nz > isize) {
388 #if defined(PETSC_HAVE_CRAY_VECTOR)
389 #pragma _CRI preferstream
390 #endif
391           for (i=0; i<isize; i++) {
392 #if defined(PETSC_HAVE_CRAY_VECTOR)
393 #pragma _CRI prefervector
394 #endif
395 
396 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
397             vec_y = _mm512_setzero_pd();
398             ipos = ip[i];
399             for (j=0; j<(nz>>3); j++) {
400               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
401               vec_vals = _mm512_loadu_pd(&aa[ipos]);
402               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
403               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
404               ipos += 8;
405             }
406             if ((nz&0x07)>2) {
407               mask     = (__mmask8)(0xff >> (8-(nz&0x07)));
408               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
409               vec_vals = _mm512_loadu_pd(&aa[ipos]);
410               vec_x    = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
411               vec_y    = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
412             } else if ((nz&0x07)==2) {
413               yp[i] += aa[ipos]*x[aj[ipos]];
414               yp[i] += aa[ipos+1]*x[aj[ipos+1]];
415             } else if ((nz&0x07)==1) {
416               yp[i] += aa[ipos]*x[aj[ipos]];
417             }
418             yp[i] += _mm512_reduce_add_pd(vec_y);
419 #else
420             for (j=0; j<nz; j++) {
421               ipos   = ip[i] + j;
422               yp[i] += aa[ipos] * x[aj[ipos]];
423             }
424 #endif
425           }
426         } else {
427           /* Otherwise, there are enough rows in the chunk to make it
428            * worthwhile to vectorize across the rows, that is, to do the
429            * matvec by operating with "columns" of the chunk. */
430           for (j=0; j<nz; j++) {
431 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
432             vec_j = _mm256_set1_epi32(j);
433             for (i=0; i<((isize>>3)<<3); i+=8) {
434               vec_y    = _mm512_loadu_pd(&yp[i]);
435               vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
436               vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
437               vec_idx  = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
438               vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
439               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
440               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
441               _mm512_storeu_pd(&yp[i],vec_y);
442             }
443             for (i=isize-(isize&0x07); i<isize; i++) {
444               ipos = ip[i]+j;
445               yp[i] += aa[ipos]*x[aj[ipos]];
446             }
447 #else
448             for (i=0; i<isize; i++) {
449               ipos   = ip[i] + j;
450               yp[i] += aa[ipos] * x[aj[ipos]];
451             }
452 #endif
453           }
454         }
455 
456 #if defined(PETSC_HAVE_CRAY_VECTOR)
457 #pragma _CRI ivdep
458 #endif
459         /* Put results from yp[] into non-permuted result vector y. */
460         for (i=0; i<isize; i++) {
461           y[iperm[istart+i]] = yp[i];
462         }
463       } /* End processing chunk of isize rows of a group. */
464     } /* End handling matvec for chunk with nz > 1. */
465   } /* End loop over igroup. */
466 #endif
467   ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr);
468   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
469   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
470   PetscFunctionReturn(0);
471 }
472 
473 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
474  * Note that the names I used to designate the vectors differs from that
475  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
476  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
477 /*
478     I hate having virtually identical code for the mult and the multadd!!!
479 */
480 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
481 {
482   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
483   const PetscScalar *x;
484   PetscScalar       *y,*w;
485   const MatScalar   *aa;
486   PetscErrorCode    ierr;
487   const PetscInt    *aj,*ai;
488 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
489   PetscInt i,j;
490 #endif
491 
492   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
493   Mat_SeqAIJPERM * aijperm;
494   PetscInt       *iperm;    /* Points to the permutation vector. */
495   PetscInt       *xgroup;
496   /* Denotes where groups of rows with same number of nonzeros
497    * begin and end in iperm. */
498   PetscInt *nzgroup;
499   PetscInt ngroup;
500   PetscInt igroup;
501   PetscInt jstart,jend;
502   /* jstart is used in loops to denote the position in iperm where a
503    * group starts; jend denotes the position where it ends.
504    * (jend + 1 is where the next group starts.) */
505   PetscInt    iold,nz;
506   PetscInt    istart,iend,isize;
507   PetscInt    ipos;
508   PetscScalar yp[NDIM];
509   PetscInt    ip[NDIM];
510   /* yp[] and ip[] are treated as vector "registers" for performing
511    * the mat-vec. */
512 
513 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
514 #pragma disjoint(*x,*y,*aa)
515 #endif
516 
517   PetscFunctionBegin;
518   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
519   ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
520 
521   aj = a->j;   /* aj[k] gives column index for element aa[k]. */
522   aa = a->a;   /* Nonzero elements stored row-by-row. */
523   ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */
524 
525   /* Get the info we need about the permutations and groupings. */
526   aijperm = (Mat_SeqAIJPERM*) A->spptr;
527   iperm   = aijperm->iperm;
528   ngroup  = aijperm->ngroup;
529   xgroup  = aijperm->xgroup;
530   nzgroup = aijperm->nzgroup;
531 
532 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
533   fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
534 #else
535 
536   for (igroup=0; igroup<ngroup; igroup++) {
537     jstart = xgroup[igroup];
538     jend   = xgroup[igroup+1] - 1;
539 
540     nz = nzgroup[igroup];
541 
542     /* Handle the special cases where the number of nonzeros per row
543      * in the group is either 0 or 1. */
544     if (nz == 0) {
545       for (i=jstart; i<=jend; i++) {
546         iold    = iperm[i];
547         y[iold] = w[iold];
548       }
549     }
550     else if (nz == 1) {
551       for (i=jstart; i<=jend; i++) {
552         iold    = iperm[i];
553         ipos    = ai[iold];
554         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
555       }
556     }
557     /* For the general case: */
558     else {
559 
560       /* We work our way through the current group in chunks of NDIM rows
561        * at a time. */
562 
563       for (istart=jstart; istart<=jend; istart+=NDIM) {
564         /* Figure out where the chunk of 'isize' rows ends in iperm.
565          * 'isize may of course be less than NDIM for the last chunk. */
566         iend = istart + (NDIM - 1);
567         if (iend > jend) iend = jend;
568         isize = iend - istart + 1;
569 
570         /* Initialize the yp[] array that will be used to hold part of
571          * the permuted results vector, and figure out where in aa each
572          * row of the chunk will begin. */
573         for (i=0; i<isize; i++) {
574           iold = iperm[istart + i];
575           /* iold is a row number from the matrix A *before* reordering. */
576           ip[i] = ai[iold];
577           /* ip[i] tells us where the ith row of the chunk begins in aa. */
578           yp[i] = w[iold];
579         }
580 
581         /* If the number of zeros per row exceeds the number of rows in
582          * the chunk, we should vectorize along nz, that is, perform the
583          * mat-vec one row at a time as in the usual CSR case. */
584         if (nz > isize) {
585 #if defined(PETSC_HAVE_CRAY_VECTOR)
586 #pragma _CRI preferstream
587 #endif
588           for (i=0; i<isize; i++) {
589 #if defined(PETSC_HAVE_CRAY_VECTOR)
590 #pragma _CRI prefervector
591 #endif
592             for (j=0; j<nz; j++) {
593               ipos   = ip[i] + j;
594               yp[i] += aa[ipos] * x[aj[ipos]];
595             }
596           }
597         }
598         /* Otherwise, there are enough rows in the chunk to make it
599          * worthwhile to vectorize across the rows, that is, to do the
600          * matvec by operating with "columns" of the chunk. */
601         else {
602           for (j=0; j<nz; j++) {
603             for (i=0; i<isize; i++) {
604               ipos   = ip[i] + j;
605               yp[i] += aa[ipos] * x[aj[ipos]];
606             }
607           }
608         }
609 
610 #if defined(PETSC_HAVE_CRAY_VECTOR)
611 #pragma _CRI ivdep
612 #endif
613         /* Put results from yp[] into non-permuted result vector y. */
614         for (i=0; i<isize; i++) {
615           y[iperm[istart+i]] = yp[i];
616         }
617       } /* End processing chunk of isize rows of a group. */
618 
619     } /* End handling matvec for chunk with nz > 1. */
620   } /* End loop over igroup. */
621 
622 #endif
623   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
624   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
625   ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
626   PetscFunctionReturn(0);
627 }
628 
629 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
630  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
631  * routine, but can also be used to convert an assembled SeqAIJ matrix
632  * into a SeqAIJPERM one. */
633 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
634 {
635   PetscErrorCode ierr;
636   Mat            B = *newmat;
637   Mat_SeqAIJPERM *aijperm;
638   PetscBool      sametype;
639 
640   PetscFunctionBegin;
641   if (reuse == MAT_INITIAL_MATRIX) {
642     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
643   }
644   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
645   if (sametype) PetscFunctionReturn(0);
646 
647   ierr     = PetscNewLog(B,&aijperm);CHKERRQ(ierr);
648   B->spptr = (void*) aijperm;
649 
650   /* Set function pointers for methods that we inherit from AIJ but override. */
651   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
652   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
653   B->ops->destroy     = MatDestroy_SeqAIJPERM;
654   B->ops->mult        = MatMult_SeqAIJPERM;
655   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
656 
657   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
658   /* If A has already been assembled, compute the permutation. */
659   if (A->assembled) {
660     ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr);
661   }
662 
663   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
664 
665   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr);
666   *newmat = B;
667   PetscFunctionReturn(0);
668 }
669 
670 /*@C
671    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
672    This type inherits from AIJ, but calculates some additional permutation
673    information that is used to allow better vectorization of some
674    operations.  At the cost of increased storage, the AIJ formatted
675    matrix can be copied to a format in which pieces of the matrix are
676    stored in ELLPACK format, allowing the vectorized matrix multiply
677    routine to use stride-1 memory accesses.  As with the AIJ type, it is
678    important to preallocate matrix storage in order to get good assembly
679    performance.
680 
681    Collective
682 
683    Input Parameters:
684 +  comm - MPI communicator, set to PETSC_COMM_SELF
685 .  m - number of rows
686 .  n - number of columns
687 .  nz - number of nonzeros per row (same for all rows)
688 -  nnz - array containing the number of nonzeros in the various rows
689          (possibly different for each row) or NULL
690 
691    Output Parameter:
692 .  A - the matrix
693 
694    Notes:
695    If nnz is given then nz is ignored
696 
697    Level: intermediate
698 
699 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
700 @*/
701 PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
702 {
703   PetscErrorCode ierr;
704 
705   PetscFunctionBegin;
706   ierr = MatCreate(comm,A);CHKERRQ(ierr);
707   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
708   ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr);
709   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
710   PetscFunctionReturn(0);
711 }
712 
713 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
714 {
715   PetscErrorCode ierr;
716 
717   PetscFunctionBegin;
718   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
719   ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
720   PetscFunctionReturn(0);
721 }
722 
723