xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision b8ced49e4e0eb4a309ccfcd29d1e92cf1cda6377)
1 
2 /*
3   Defines basic operations for the MATSEQAIJPERM matrix class.
4   This class is derived from the MATSEQAIJ class and retains the
5   compressed row storage (aka Yale sparse matrix format) but augments
6   it with some permutation information that enables some operations
7   to be more vectorizable.  A physically rearranged copy of the matrix
8   may be stored if the user desires.
9 
10   Eventually a variety of permutations may be supported.
11 */
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #define NDIM 512
16 /* NDIM specifies how many rows at a time we should work with when
17  * performing the vectorized mat-vec.  This depends on various factors
18  * such as vector register length, etc., and I really need to add a
19  * way for the user (or the library) to tune this.  I'm setting it to
20  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
21  * routines. */
22 
23 typedef struct {
24   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
25 
26   PetscInt         ngroup;
27   PetscInt         *xgroup;
28   /* Denotes where groups of rows with same number of nonzeros
29    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
30    * where the ith group begins. */
31 
32   PetscInt         *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
33   PetscInt         *iperm;  /* The permutation vector. */
34 
35   /* Some of this stuff is for Ed's recursive triangular solve.
36    * I'm not sure what I need yet. */
37   PetscInt         blocksize;
38   PetscInt         nstep;
39   PetscInt         *jstart_list;
40   PetscInt         *jend_list;
41   PetscInt         *action_list;
42   PetscInt         *ngroup_list;
43   PetscInt         **ipointer_list;
44   PetscInt         **xgroup_list;
45   PetscInt         **nzgroup_list;
46   PetscInt         **iperm_list;
47 } Mat_SeqAIJPERM;
48 
49 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
50 {
51   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
52   /* so we will ignore 'MatType type'. */
53   PetscErrorCode ierr;
54   Mat            B       = *newmat;
55   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
56 
57   PetscFunctionBegin;
58   if (reuse == MAT_INITIAL_MATRIX) {
59     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
60     aijperm=(Mat_SeqAIJPERM*)B->spptr;
61   }
62 
63   /* Reset the original function pointers. */
64   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
65   B->ops->destroy     = MatDestroy_SeqAIJ;
66   B->ops->duplicate   = MatDuplicate_SeqAIJ;
67   B->ops->mult        = MatMult_SeqAIJ;
68   B->ops->multadd     = MatMultAdd_SeqAIJ;
69 
70   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr);
71   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
72   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
73   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
74 
75   /* Free everything in the Mat_SeqAIJPERM data structure.*/
76   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
77   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
78   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
79   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
80 
81   /* Change the type of B to MATSEQAIJ. */
82   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
83 
84   *newmat = B;
85   PetscFunctionReturn(0);
86 }
87 
88 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
89 {
90   PetscErrorCode ierr;
91   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
92 
93   PetscFunctionBegin;
94   if (aijperm) {
95     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
96     ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
97     ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
98     ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
99     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
100   }
101   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
102    * to destroy everything that remains. */
103   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
104   /* Note that I don't call MatSetType().  I believe this is because that
105    * is only to be called when *building* a matrix.  I could be wrong, but
106    * that is how things work for the SuperLU matrix class. */
107   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
108   PetscFunctionReturn(0);
109 }
110 
111 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
112 {
113   PetscErrorCode ierr;
114   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
115   Mat_SeqAIJPERM *aijperm_dest;
116   PetscBool      perm;
117 
118   PetscFunctionBegin;
119   ierr        = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
120   ierr        = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr);
121   if (perm) {
122     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
123     ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr);
124     ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr);
125     ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr);
126   } else {
127     ierr        = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr);
128     (*M)->spptr = (void*) aijperm_dest;
129     ierr        = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr);
130     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
131     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
132     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
133     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
134   }
135   ierr        = PetscMemcpy(aijperm_dest,aijperm,sizeof(Mat_SeqAIJPERM));CHKERRQ(ierr);
136   /* Allocate space for, and copy the grouping and permutation info.
137    * I note that when the groups are initially determined in
138    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
139    * necessary.  But at this point, we know how large they need to be, and
140    * allocate only the necessary amount of memory.  So the duplicated matrix
141    * may actually use slightly less storage than the original! */
142   ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr);
143   ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr);
144   ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr);
145   ierr = PetscMemcpy(aijperm_dest->iperm,aijperm->iperm,sizeof(PetscInt)*A->rmap->n);CHKERRQ(ierr);
146   ierr = PetscMemcpy(aijperm_dest->xgroup,aijperm->xgroup,sizeof(PetscInt)*(aijperm->ngroup+1));CHKERRQ(ierr);
147   ierr = PetscMemcpy(aijperm_dest->nzgroup,aijperm->nzgroup,sizeof(PetscInt)*aijperm->ngroup);CHKERRQ(ierr);
148   PetscFunctionReturn(0);
149 }
150 
151 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
152 {
153   PetscErrorCode ierr;
154   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
155   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
156   PetscInt       m;       /* Number of rows in the matrix. */
157   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
158   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
159   PetscInt       *rows_in_bucket;
160   /* To construct the permutation, we sort each row into one of maxnz
161    * buckets based on how many nonzeros are in the row. */
162   PetscInt       nz;
163   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
164   PetscInt       *ipnz;
165   /* When constructing the iperm permutation vector,
166    * ipnz[nz] is used to point to the next place in the permutation vector
167    * that a row with nz nonzero elements should be placed.*/
168   PetscInt       i, ngroup, istart, ipos;
169 
170   PetscFunctionBegin;
171   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
172   aijperm->nonzerostate = A->nonzerostate;
173  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
174   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
175   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
176   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
177 
178   m  = A->rmap->n;
179   ia = a->i;
180 
181   /* Allocate the arrays that will hold the permutation vector. */
182   ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr);
183 
184   /* Allocate some temporary work arrays that will be used in
185    * calculating the permuation vector and groupings. */
186   ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr);
187 
188   /* Now actually figure out the permutation and grouping. */
189 
190   /* First pass: Determine number of nonzeros in each row, maximum
191    * number of nonzeros in any row, and how many rows fall into each
192    * "bucket" of rows with same number of nonzeros. */
193   maxnz = 0;
194   for (i=0; i<m; i++) {
195     nz_in_row[i] = ia[i+1]-ia[i];
196     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
197   }
198   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr);
199   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr);
200 
201   for (i=0; i<=maxnz; i++) {
202     rows_in_bucket[i] = 0;
203   }
204   for (i=0; i<m; i++) {
205     nz = nz_in_row[i];
206     rows_in_bucket[nz]++;
207   }
208 
209   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
210    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
211    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
212    * then xgroup[] must consist of (maxnz + 2) elements, since the last
213    * element of xgroup will tell us where the (maxnz + 1)th group ends.
214    * We allocate space for the maximum number of groups;
215    * that is potentially a little wasteful, but not too much so.
216    * Perhaps I should fix it later. */
217   ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr);
218   ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr);
219 
220   /* Second pass.  Look at what is in the buckets and create the groupings.
221    * Note that it is OK to have a group of rows with no non-zero values. */
222   ngroup = 0;
223   istart = 0;
224   for (i=0; i<=maxnz; i++) {
225     if (rows_in_bucket[i] > 0) {
226       aijperm->nzgroup[ngroup] = i;
227       aijperm->xgroup[ngroup]  = istart;
228       ngroup++;
229       istart += rows_in_bucket[i];
230     }
231   }
232 
233   aijperm->xgroup[ngroup] = istart;
234   aijperm->ngroup         = ngroup;
235 
236   /* Now fill in the permutation vector iperm. */
237   ipnz[0] = 0;
238   for (i=0; i<maxnz; i++) {
239     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
240   }
241 
242   for (i=0; i<m; i++) {
243     nz                   = nz_in_row[i];
244     ipos                 = ipnz[nz];
245     aijperm->iperm[ipos] = i;
246     ipnz[nz]++;
247   }
248 
249   /* Clean up temporary work arrays. */
250   ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr);
251   ierr = PetscFree(ipnz);CHKERRQ(ierr);
252   ierr = PetscFree(nz_in_row);CHKERRQ(ierr);
253   PetscFunctionReturn(0);
254 }
255 
256 
257 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
258 {
259   PetscErrorCode ierr;
260   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
261 
262   PetscFunctionBegin;
263   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
264 
265   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
266    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
267    * I'm not sure if this is the best way to do this, but it avoids
268    * a lot of code duplication.
269    * I also note that currently MATSEQAIJPERM doesn't know anything about
270    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
271    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
272    * this, this may break things.  (Don't know... haven't looked at it.) */
273   a->inode.use = PETSC_FALSE;
274   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
275 
276   /* Now calculate the permutation and grouping information. */
277   ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr);
278   PetscFunctionReturn(0);
279 }
280 
281 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
282 {
283   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
284   const PetscScalar *x;
285   PetscScalar       *y;
286   const MatScalar   *aa;
287   PetscErrorCode    ierr;
288   const PetscInt    *aj,*ai;
289 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
290   PetscInt          i,j;
291 #endif
292 
293   /* Variables that don't appear in MatMult_SeqAIJ. */
294   Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
295   PetscInt          *iperm;  /* Points to the permutation vector. */
296   PetscInt          *xgroup;
297   /* Denotes where groups of rows with same number of nonzeros
298    * begin and end in iperm. */
299   PetscInt          *nzgroup;
300   PetscInt          ngroup;
301   PetscInt          igroup;
302   PetscInt          jstart,jend;
303   /* jstart is used in loops to denote the position in iperm where a
304    * group starts; jend denotes the position where it ends.
305    * (jend + 1 is where the next group starts.) */
306   PetscInt          iold,nz;
307   PetscInt          istart,iend,isize;
308   PetscInt          ipos;
309   PetscScalar       yp[NDIM];
310   PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
311 
312 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
313 #pragma disjoint(*x,*y,*aa)
314 #endif
315 
316   PetscFunctionBegin;
317   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
318   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
319   aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
320   aa   = a->a; /* Nonzero elements stored row-by-row. */
321   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
322 
323   /* Get the info we need about the permutations and groupings. */
324   iperm   = aijperm->iperm;
325   ngroup  = aijperm->ngroup;
326   xgroup  = aijperm->xgroup;
327   nzgroup = aijperm->nzgroup;
328 
329 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
330   fortranmultaijperm_(&m,x,ii,aj,aa,y);
331 #else
332 
333   for (igroup=0; igroup<ngroup; igroup++) {
334     jstart = xgroup[igroup];
335     jend   = xgroup[igroup+1] - 1;
336     nz     = nzgroup[igroup];
337 
338     /* Handle the special cases where the number of nonzeros per row
339      * in the group is either 0 or 1. */
340     if (nz == 0) {
341       for (i=jstart; i<=jend; i++) {
342         y[iperm[i]] = 0.0;
343       }
344     } else if (nz == 1) {
345       for (i=jstart; i<=jend; i++) {
346         iold    = iperm[i];
347         ipos    = ai[iold];
348         y[iold] = aa[ipos] * x[aj[ipos]];
349       }
350     } else {
351 
352       /* We work our way through the current group in chunks of NDIM rows
353        * at a time. */
354 
355       for (istart=jstart; istart<=jend; istart+=NDIM) {
356         /* Figure out where the chunk of 'isize' rows ends in iperm.
357          * 'isize may of course be less than NDIM for the last chunk. */
358         iend = istart + (NDIM - 1);
359 
360         if (iend > jend) iend = jend;
361 
362         isize = iend - istart + 1;
363 
364         /* Initialize the yp[] array that will be used to hold part of
365          * the permuted results vector, and figure out where in aa each
366          * row of the chunk will begin. */
367         for (i=0; i<isize; i++) {
368           iold = iperm[istart + i];
369           /* iold is a row number from the matrix A *before* reordering. */
370           ip[i] = ai[iold];
371           /* ip[i] tells us where the ith row of the chunk begins in aa. */
372           yp[i] = (PetscScalar) 0.0;
373         }
374 
375         /* If the number of zeros per row exceeds the number of rows in
376          * the chunk, we should vectorize along nz, that is, perform the
377          * mat-vec one row at a time as in the usual CSR case. */
378         if (nz > isize) {
379 #if defined(PETSC_HAVE_CRAY_VECTOR)
380 #pragma _CRI preferstream
381 #endif
382           for (i=0; i<isize; i++) {
383 #if defined(PETSC_HAVE_CRAY_VECTOR)
384 #pragma _CRI prefervector
385 #endif
386             for (j=0; j<nz; j++) {
387               ipos   = ip[i] + j;
388               yp[i] += aa[ipos] * x[aj[ipos]];
389             }
390           }
391         } else {
392           /* Otherwise, there are enough rows in the chunk to make it
393            * worthwhile to vectorize across the rows, that is, to do the
394            * matvec by operating with "columns" of the chunk. */
395           for (j=0; j<nz; j++) {
396             for (i=0; i<isize; i++) {
397               ipos   = ip[i] + j;
398               yp[i] += aa[ipos] * x[aj[ipos]];
399             }
400           }
401         }
402 
403 #if defined(PETSC_HAVE_CRAY_VECTOR)
404 #pragma _CRI ivdep
405 #endif
406         /* Put results from yp[] into non-permuted result vector y. */
407         for (i=0; i<isize; i++) {
408           y[iperm[istart+i]] = yp[i];
409         }
410       } /* End processing chunk of isize rows of a group. */
411     } /* End handling matvec for chunk with nz > 1. */
412   } /* End loop over igroup. */
413 #endif
414   ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr);
415   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
416   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
417   PetscFunctionReturn(0);
418 }
419 
420 
421 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
422  * Note that the names I used to designate the vectors differs from that
423  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
424  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
425 /*
426     I hate having virtually identical code for the mult and the multadd!!!
427 */
428 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
429 {
430   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
431   const PetscScalar *x;
432   PetscScalar       *y,*w;
433   const MatScalar   *aa;
434   PetscErrorCode    ierr;
435   const PetscInt    *aj,*ai;
436 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
437   PetscInt i,j;
438 #endif
439 
440   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
441   Mat_SeqAIJPERM * aijperm;
442   PetscInt       *iperm;    /* Points to the permutation vector. */
443   PetscInt       *xgroup;
444   /* Denotes where groups of rows with same number of nonzeros
445    * begin and end in iperm. */
446   PetscInt *nzgroup;
447   PetscInt ngroup;
448   PetscInt igroup;
449   PetscInt jstart,jend;
450   /* jstart is used in loops to denote the position in iperm where a
451    * group starts; jend denotes the position where it ends.
452    * (jend + 1 is where the next group starts.) */
453   PetscInt    iold,nz;
454   PetscInt    istart,iend,isize;
455   PetscInt    ipos;
456   PetscScalar yp[NDIM];
457   PetscInt    ip[NDIM];
458   /* yp[] and ip[] are treated as vector "registers" for performing
459    * the mat-vec. */
460 
461 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
462 #pragma disjoint(*x,*y,*aa)
463 #endif
464 
465   PetscFunctionBegin;
466   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
467   ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
468 
469   aj = a->j;   /* aj[k] gives column index for element aa[k]. */
470   aa = a->a;   /* Nonzero elements stored row-by-row. */
471   ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */
472 
473   /* Get the info we need about the permutations and groupings. */
474   aijperm = (Mat_SeqAIJPERM*) A->spptr;
475   iperm   = aijperm->iperm;
476   ngroup  = aijperm->ngroup;
477   xgroup  = aijperm->xgroup;
478   nzgroup = aijperm->nzgroup;
479 
480 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
481   fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
482 #else
483 
484   for (igroup=0; igroup<ngroup; igroup++) {
485     jstart = xgroup[igroup];
486     jend   = xgroup[igroup+1] - 1;
487 
488     nz = nzgroup[igroup];
489 
490     /* Handle the special cases where the number of nonzeros per row
491      * in the group is either 0 or 1. */
492     if (nz == 0) {
493       for (i=jstart; i<=jend; i++) {
494         iold    = iperm[i];
495         y[iold] = w[iold];
496       }
497     }
498     else if (nz == 1) {
499       for (i=jstart; i<=jend; i++) {
500         iold    = iperm[i];
501         ipos    = ai[iold];
502         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
503       }
504     }
505     /* For the general case: */
506     else {
507 
508       /* We work our way through the current group in chunks of NDIM rows
509        * at a time. */
510 
511       for (istart=jstart; istart<=jend; istart+=NDIM) {
512         /* Figure out where the chunk of 'isize' rows ends in iperm.
513          * 'isize may of course be less than NDIM for the last chunk. */
514         iend = istart + (NDIM - 1);
515         if (iend > jend) iend = jend;
516         isize = iend - istart + 1;
517 
518         /* Initialize the yp[] array that will be used to hold part of
519          * the permuted results vector, and figure out where in aa each
520          * row of the chunk will begin. */
521         for (i=0; i<isize; i++) {
522           iold = iperm[istart + i];
523           /* iold is a row number from the matrix A *before* reordering. */
524           ip[i] = ai[iold];
525           /* ip[i] tells us where the ith row of the chunk begins in aa. */
526           yp[i] = w[iold];
527         }
528 
529         /* If the number of zeros per row exceeds the number of rows in
530          * the chunk, we should vectorize along nz, that is, perform the
531          * mat-vec one row at a time as in the usual CSR case. */
532         if (nz > isize) {
533 #if defined(PETSC_HAVE_CRAY_VECTOR)
534 #pragma _CRI preferstream
535 #endif
536           for (i=0; i<isize; i++) {
537 #if defined(PETSC_HAVE_CRAY_VECTOR)
538 #pragma _CRI prefervector
539 #endif
540             for (j=0; j<nz; j++) {
541               ipos   = ip[i] + j;
542               yp[i] += aa[ipos] * x[aj[ipos]];
543             }
544           }
545         }
546         /* Otherwise, there are enough rows in the chunk to make it
547          * worthwhile to vectorize across the rows, that is, to do the
548          * matvec by operating with "columns" of the chunk. */
549         else {
550           for (j=0; j<nz; j++) {
551             for (i=0; i<isize; i++) {
552               ipos   = ip[i] + j;
553               yp[i] += aa[ipos] * x[aj[ipos]];
554             }
555           }
556         }
557 
558 #if defined(PETSC_HAVE_CRAY_VECTOR)
559 #pragma _CRI ivdep
560 #endif
561         /* Put results from yp[] into non-permuted result vector y. */
562         for (i=0; i<isize; i++) {
563           y[iperm[istart+i]] = yp[i];
564         }
565       } /* End processing chunk of isize rows of a group. */
566 
567     } /* End handling matvec for chunk with nz > 1. */
568   } /* End loop over igroup. */
569 
570 #endif
571   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
572   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
573   ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
574   PetscFunctionReturn(0);
575 }
576 
577 
578 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
579  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
580  * routine, but can also be used to convert an assembled SeqAIJ matrix
581  * into a SeqAIJPERM one. */
582 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
583 {
584   PetscErrorCode ierr;
585   Mat            B = *newmat;
586   Mat_SeqAIJPERM *aijperm;
587   PetscBool      sametype;
588 
589   PetscFunctionBegin;
590   if (reuse == MAT_INITIAL_MATRIX) {
591     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
592   }
593   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
594   if (sametype) PetscFunctionReturn(0);
595 
596   ierr     = PetscNewLog(B,&aijperm);CHKERRQ(ierr);
597   B->spptr = (void*) aijperm;
598 
599   /* Set function pointers for methods that we inherit from AIJ but override. */
600   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
601   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
602   B->ops->destroy     = MatDestroy_SeqAIJPERM;
603   B->ops->mult        = MatMult_SeqAIJPERM;
604   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
605 
606   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
607   /* If A has already been assembled, compute the permutation. */
608   if (A->assembled) {
609     ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr);
610   }
611 
612   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
613   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
614   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
615   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
616 
617   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr);
618   *newmat = B;
619   PetscFunctionReturn(0);
620 }
621 
622 /*@C
623    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
624    This type inherits from AIJ, but calculates some additional permutation
625    information that is used to allow better vectorization of some
626    operations.  At the cost of increased storage, the AIJ formatted
627    matrix can be copied to a format in which pieces of the matrix are
628    stored in ELLPACK format, allowing the vectorized matrix multiply
629    routine to use stride-1 memory accesses.  As with the AIJ type, it is
630    important to preallocate matrix storage in order to get good assembly
631    performance.
632 
633    Collective on MPI_Comm
634 
635    Input Parameters:
636 +  comm - MPI communicator, set to PETSC_COMM_SELF
637 .  m - number of rows
638 .  n - number of columns
639 .  nz - number of nonzeros per row (same for all rows)
640 -  nnz - array containing the number of nonzeros in the various rows
641          (possibly different for each row) or NULL
642 
643    Output Parameter:
644 .  A - the matrix
645 
646    Notes:
647    If nnz is given then nz is ignored
648 
649    Level: intermediate
650 
651 .keywords: matrix, cray, sparse, parallel
652 
653 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
654 @*/
655 PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
656 {
657   PetscErrorCode ierr;
658 
659   PetscFunctionBegin;
660   ierr = MatCreate(comm,A);CHKERRQ(ierr);
661   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
662   ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr);
663   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
664   PetscFunctionReturn(0);
665 }
666 
667 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
668 {
669   PetscErrorCode ierr;
670 
671   PetscFunctionBegin;
672   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
673   ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
674   PetscFunctionReturn(0);
675 }
676 
677