xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision 80423c7a917f90e59f152b5ea0c4cb4e461b0011)
1938d9b04SRichard Tran Mills 
2938d9b04SRichard Tran Mills /*
3938d9b04SRichard Tran Mills   Defines basic operations for the MATSEQAIJPERM matrix class.
4938d9b04SRichard Tran Mills   This class is derived from the MATSEQAIJ class and retains the
5938d9b04SRichard Tran Mills   compressed row storage (aka Yale sparse matrix format) but augments
6938d9b04SRichard Tran Mills   it with some permutation information that enables some operations
7938d9b04SRichard Tran Mills   to be more vectorizable.  A physically rearranged copy of the matrix
8938d9b04SRichard Tran Mills   may be stored if the user desires.
9938d9b04SRichard Tran Mills 
10938d9b04SRichard Tran Mills   Eventually a variety of permutations may be supported.
11938d9b04SRichard Tran Mills */
12938d9b04SRichard Tran Mills 
13938d9b04SRichard Tran Mills #include <../src/mat/impls/aij/seq/aij.h>
14938d9b04SRichard Tran Mills 
15*80423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16f67b6f2eSRichard Tran Mills #include <immintrin.h>
17f67b6f2eSRichard Tran Mills 
18f67b6f2eSRichard Tran Mills #if !defined(_MM_SCALE_8)
19f67b6f2eSRichard Tran Mills #define _MM_SCALE_8    8
20f67b6f2eSRichard Tran Mills #endif
21f67b6f2eSRichard Tran Mills #if !defined(_MM_SCALE_4)
22f67b6f2eSRichard Tran Mills #define _MM_SCALE_4    4
23f67b6f2eSRichard Tran Mills #endif
24f67b6f2eSRichard Tran Mills #endif
25f67b6f2eSRichard Tran Mills 
26938d9b04SRichard Tran Mills #define NDIM 512
27938d9b04SRichard Tran Mills /* NDIM specifies how many rows at a time we should work with when
28938d9b04SRichard Tran Mills  * performing the vectorized mat-vec.  This depends on various factors
29938d9b04SRichard Tran Mills  * such as vector register length, etc., and I really need to add a
30938d9b04SRichard Tran Mills  * way for the user (or the library) to tune this.  I'm setting it to
31938d9b04SRichard Tran Mills  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32938d9b04SRichard Tran Mills  * routines. */
33938d9b04SRichard Tran Mills 
34938d9b04SRichard Tran Mills typedef struct {
35938d9b04SRichard Tran Mills   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36938d9b04SRichard Tran Mills 
37938d9b04SRichard Tran Mills   PetscInt         ngroup;
38938d9b04SRichard Tran Mills   PetscInt         *xgroup;
39938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
40938d9b04SRichard Tran Mills    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41938d9b04SRichard Tran Mills    * where the ith group begins. */
42938d9b04SRichard Tran Mills 
43938d9b04SRichard Tran Mills   PetscInt         *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
44938d9b04SRichard Tran Mills   PetscInt         *iperm;  /* The permutation vector. */
45938d9b04SRichard Tran Mills 
46938d9b04SRichard Tran Mills   /* Some of this stuff is for Ed's recursive triangular solve.
47938d9b04SRichard Tran Mills    * I'm not sure what I need yet. */
48938d9b04SRichard Tran Mills   PetscInt         blocksize;
49938d9b04SRichard Tran Mills   PetscInt         nstep;
50938d9b04SRichard Tran Mills   PetscInt         *jstart_list;
51938d9b04SRichard Tran Mills   PetscInt         *jend_list;
52938d9b04SRichard Tran Mills   PetscInt         *action_list;
53938d9b04SRichard Tran Mills   PetscInt         *ngroup_list;
54938d9b04SRichard Tran Mills   PetscInt         **ipointer_list;
55938d9b04SRichard Tran Mills   PetscInt         **xgroup_list;
56938d9b04SRichard Tran Mills   PetscInt         **nzgroup_list;
57938d9b04SRichard Tran Mills   PetscInt         **iperm_list;
58938d9b04SRichard Tran Mills } Mat_SeqAIJPERM;
59938d9b04SRichard Tran Mills 
60938d9b04SRichard Tran Mills PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61938d9b04SRichard Tran Mills {
62938d9b04SRichard Tran Mills   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63938d9b04SRichard Tran Mills   /* so we will ignore 'MatType type'. */
64938d9b04SRichard Tran Mills   PetscErrorCode ierr;
65938d9b04SRichard Tran Mills   Mat            B       = *newmat;
66938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
67938d9b04SRichard Tran Mills 
68938d9b04SRichard Tran Mills   PetscFunctionBegin;
69938d9b04SRichard Tran Mills   if (reuse == MAT_INITIAL_MATRIX) {
70938d9b04SRichard Tran Mills     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
71938d9b04SRichard Tran Mills     aijperm=(Mat_SeqAIJPERM*)B->spptr;
72938d9b04SRichard Tran Mills   }
73938d9b04SRichard Tran Mills 
74938d9b04SRichard Tran Mills   /* Reset the original function pointers. */
75938d9b04SRichard Tran Mills   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
76938d9b04SRichard Tran Mills   B->ops->destroy     = MatDestroy_SeqAIJ;
77938d9b04SRichard Tran Mills   B->ops->duplicate   = MatDuplicate_SeqAIJ;
78938d9b04SRichard Tran Mills   B->ops->mult        = MatMult_SeqAIJ;
79938d9b04SRichard Tran Mills   B->ops->multadd     = MatMultAdd_SeqAIJ;
80938d9b04SRichard Tran Mills 
81938d9b04SRichard Tran Mills   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr);
82938d9b04SRichard Tran Mills   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
83938d9b04SRichard Tran Mills   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
84938d9b04SRichard Tran Mills   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr);
850d26a50cSRichard Tran Mills   ierr = PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijperm_C",NULL);CHKERRQ(ierr);
86938d9b04SRichard Tran Mills 
87938d9b04SRichard Tran Mills   /* Free everything in the Mat_SeqAIJPERM data structure.*/
88938d9b04SRichard Tran Mills   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
89938d9b04SRichard Tran Mills   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
90938d9b04SRichard Tran Mills   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
91938d9b04SRichard Tran Mills   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
92938d9b04SRichard Tran Mills 
93938d9b04SRichard Tran Mills   /* Change the type of B to MATSEQAIJ. */
94938d9b04SRichard Tran Mills   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
95938d9b04SRichard Tran Mills 
96938d9b04SRichard Tran Mills   *newmat = B;
97938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
98938d9b04SRichard Tran Mills }
99938d9b04SRichard Tran Mills 
100938d9b04SRichard Tran Mills PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
101938d9b04SRichard Tran Mills {
102938d9b04SRichard Tran Mills   PetscErrorCode ierr;
103938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
104938d9b04SRichard Tran Mills 
105938d9b04SRichard Tran Mills   PetscFunctionBegin;
106938d9b04SRichard Tran Mills   if (aijperm) {
107938d9b04SRichard Tran Mills     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
108938d9b04SRichard Tran Mills     ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
109938d9b04SRichard Tran Mills     ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
110938d9b04SRichard Tran Mills     ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
111938d9b04SRichard Tran Mills     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
112938d9b04SRichard Tran Mills   }
113938d9b04SRichard Tran Mills   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
114938d9b04SRichard Tran Mills    * to destroy everything that remains. */
115938d9b04SRichard Tran Mills   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
116938d9b04SRichard Tran Mills   /* Note that I don't call MatSetType().  I believe this is because that
117938d9b04SRichard Tran Mills    * is only to be called when *building* a matrix.  I could be wrong, but
118938d9b04SRichard Tran Mills    * that is how things work for the SuperLU matrix class. */
119938d9b04SRichard Tran Mills   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
120938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
121938d9b04SRichard Tran Mills }
122938d9b04SRichard Tran Mills 
123938d9b04SRichard Tran Mills PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
124938d9b04SRichard Tran Mills {
125938d9b04SRichard Tran Mills   PetscErrorCode ierr;
126938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
127938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm_dest;
128938d9b04SRichard Tran Mills   PetscBool      perm;
129938d9b04SRichard Tran Mills 
130938d9b04SRichard Tran Mills   PetscFunctionBegin;
131938d9b04SRichard Tran Mills   ierr        = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
132938d9b04SRichard Tran Mills   ierr        = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr);
133938d9b04SRichard Tran Mills   if (perm) {
134938d9b04SRichard Tran Mills     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
135938d9b04SRichard Tran Mills     ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr);
136938d9b04SRichard Tran Mills     ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr);
137938d9b04SRichard Tran Mills     ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr);
138938d9b04SRichard Tran Mills   } else {
139938d9b04SRichard Tran Mills     ierr        = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr);
140938d9b04SRichard Tran Mills     (*M)->spptr = (void*) aijperm_dest;
141938d9b04SRichard Tran Mills     ierr        = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr);
142938d9b04SRichard Tran Mills     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
143938d9b04SRichard Tran Mills     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
144938d9b04SRichard Tran Mills     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
145938d9b04SRichard Tran Mills     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
146938d9b04SRichard Tran Mills   }
147580bdb30SBarry Smith   ierr        = PetscArraycpy(aijperm_dest,aijperm,1);CHKERRQ(ierr);
148938d9b04SRichard Tran Mills   /* Allocate space for, and copy the grouping and permutation info.
149938d9b04SRichard Tran Mills    * I note that when the groups are initially determined in
150938d9b04SRichard Tran Mills    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
151938d9b04SRichard Tran Mills    * necessary.  But at this point, we know how large they need to be, and
152938d9b04SRichard Tran Mills    * allocate only the necessary amount of memory.  So the duplicated matrix
153938d9b04SRichard Tran Mills    * may actually use slightly less storage than the original! */
154938d9b04SRichard Tran Mills   ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr);
155938d9b04SRichard Tran Mills   ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr);
156938d9b04SRichard Tran Mills   ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr);
157580bdb30SBarry Smith   ierr = PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);CHKERRQ(ierr);
158580bdb30SBarry Smith   ierr = PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);CHKERRQ(ierr);
159580bdb30SBarry Smith   ierr = PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);CHKERRQ(ierr);
160938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
161938d9b04SRichard Tran Mills }
162938d9b04SRichard Tran Mills 
163938d9b04SRichard Tran Mills PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
164938d9b04SRichard Tran Mills {
165938d9b04SRichard Tran Mills   PetscErrorCode ierr;
166938d9b04SRichard Tran Mills   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
167938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
168938d9b04SRichard Tran Mills   PetscInt       m;       /* Number of rows in the matrix. */
169938d9b04SRichard Tran Mills   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
170938d9b04SRichard Tran Mills   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
171938d9b04SRichard Tran Mills   PetscInt       *rows_in_bucket;
172938d9b04SRichard Tran Mills   /* To construct the permutation, we sort each row into one of maxnz
173938d9b04SRichard Tran Mills    * buckets based on how many nonzeros are in the row. */
174938d9b04SRichard Tran Mills   PetscInt       nz;
175938d9b04SRichard Tran Mills   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
176938d9b04SRichard Tran Mills   PetscInt       *ipnz;
177938d9b04SRichard Tran Mills   /* When constructing the iperm permutation vector,
178938d9b04SRichard Tran Mills    * ipnz[nz] is used to point to the next place in the permutation vector
179938d9b04SRichard Tran Mills    * that a row with nz nonzero elements should be placed.*/
180938d9b04SRichard Tran Mills   PetscInt       i, ngroup, istart, ipos;
181938d9b04SRichard Tran Mills 
182938d9b04SRichard Tran Mills   PetscFunctionBegin;
183938d9b04SRichard Tran Mills   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
184938d9b04SRichard Tran Mills   aijperm->nonzerostate = A->nonzerostate;
185938d9b04SRichard Tran Mills  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
186938d9b04SRichard Tran Mills   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
187938d9b04SRichard Tran Mills   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
188938d9b04SRichard Tran Mills   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
189938d9b04SRichard Tran Mills 
190938d9b04SRichard Tran Mills   m  = A->rmap->n;
191938d9b04SRichard Tran Mills   ia = a->i;
192938d9b04SRichard Tran Mills 
193938d9b04SRichard Tran Mills   /* Allocate the arrays that will hold the permutation vector. */
194938d9b04SRichard Tran Mills   ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr);
195938d9b04SRichard Tran Mills 
196938d9b04SRichard Tran Mills   /* Allocate some temporary work arrays that will be used in
197938d9b04SRichard Tran Mills    * calculating the permuation vector and groupings. */
198938d9b04SRichard Tran Mills   ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr);
199938d9b04SRichard Tran Mills 
200938d9b04SRichard Tran Mills   /* Now actually figure out the permutation and grouping. */
201938d9b04SRichard Tran Mills 
202938d9b04SRichard Tran Mills   /* First pass: Determine number of nonzeros in each row, maximum
203938d9b04SRichard Tran Mills    * number of nonzeros in any row, and how many rows fall into each
204938d9b04SRichard Tran Mills    * "bucket" of rows with same number of nonzeros. */
205938d9b04SRichard Tran Mills   maxnz = 0;
206938d9b04SRichard Tran Mills   for (i=0; i<m; i++) {
207938d9b04SRichard Tran Mills     nz_in_row[i] = ia[i+1]-ia[i];
208938d9b04SRichard Tran Mills     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
209938d9b04SRichard Tran Mills   }
210938d9b04SRichard Tran Mills   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr);
211938d9b04SRichard Tran Mills   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr);
212938d9b04SRichard Tran Mills 
213938d9b04SRichard Tran Mills   for (i=0; i<=maxnz; i++) {
214938d9b04SRichard Tran Mills     rows_in_bucket[i] = 0;
215938d9b04SRichard Tran Mills   }
216938d9b04SRichard Tran Mills   for (i=0; i<m; i++) {
217938d9b04SRichard Tran Mills     nz = nz_in_row[i];
218938d9b04SRichard Tran Mills     rows_in_bucket[nz]++;
219938d9b04SRichard Tran Mills   }
220938d9b04SRichard Tran Mills 
221938d9b04SRichard Tran Mills   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
222938d9b04SRichard Tran Mills    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
223938d9b04SRichard Tran Mills    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
224938d9b04SRichard Tran Mills    * then xgroup[] must consist of (maxnz + 2) elements, since the last
225938d9b04SRichard Tran Mills    * element of xgroup will tell us where the (maxnz + 1)th group ends.
226938d9b04SRichard Tran Mills    * We allocate space for the maximum number of groups;
227938d9b04SRichard Tran Mills    * that is potentially a little wasteful, but not too much so.
228938d9b04SRichard Tran Mills    * Perhaps I should fix it later. */
229938d9b04SRichard Tran Mills   ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr);
230938d9b04SRichard Tran Mills   ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr);
231938d9b04SRichard Tran Mills 
232938d9b04SRichard Tran Mills   /* Second pass.  Look at what is in the buckets and create the groupings.
233938d9b04SRichard Tran Mills    * Note that it is OK to have a group of rows with no non-zero values. */
234938d9b04SRichard Tran Mills   ngroup = 0;
235938d9b04SRichard Tran Mills   istart = 0;
236938d9b04SRichard Tran Mills   for (i=0; i<=maxnz; i++) {
237938d9b04SRichard Tran Mills     if (rows_in_bucket[i] > 0) {
238938d9b04SRichard Tran Mills       aijperm->nzgroup[ngroup] = i;
239938d9b04SRichard Tran Mills       aijperm->xgroup[ngroup]  = istart;
240938d9b04SRichard Tran Mills       ngroup++;
241938d9b04SRichard Tran Mills       istart += rows_in_bucket[i];
242938d9b04SRichard Tran Mills     }
243938d9b04SRichard Tran Mills   }
244938d9b04SRichard Tran Mills 
245938d9b04SRichard Tran Mills   aijperm->xgroup[ngroup] = istart;
246938d9b04SRichard Tran Mills   aijperm->ngroup         = ngroup;
247938d9b04SRichard Tran Mills 
248938d9b04SRichard Tran Mills   /* Now fill in the permutation vector iperm. */
249938d9b04SRichard Tran Mills   ipnz[0] = 0;
250938d9b04SRichard Tran Mills   for (i=0; i<maxnz; i++) {
251938d9b04SRichard Tran Mills     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
252938d9b04SRichard Tran Mills   }
253938d9b04SRichard Tran Mills 
254938d9b04SRichard Tran Mills   for (i=0; i<m; i++) {
255938d9b04SRichard Tran Mills     nz                   = nz_in_row[i];
256938d9b04SRichard Tran Mills     ipos                 = ipnz[nz];
257938d9b04SRichard Tran Mills     aijperm->iperm[ipos] = i;
258938d9b04SRichard Tran Mills     ipnz[nz]++;
259938d9b04SRichard Tran Mills   }
260938d9b04SRichard Tran Mills 
261938d9b04SRichard Tran Mills   /* Clean up temporary work arrays. */
262938d9b04SRichard Tran Mills   ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr);
263938d9b04SRichard Tran Mills   ierr = PetscFree(ipnz);CHKERRQ(ierr);
264938d9b04SRichard Tran Mills   ierr = PetscFree(nz_in_row);CHKERRQ(ierr);
265938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
266938d9b04SRichard Tran Mills }
267938d9b04SRichard Tran Mills 
268938d9b04SRichard Tran Mills 
269938d9b04SRichard Tran Mills PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
270938d9b04SRichard Tran Mills {
271938d9b04SRichard Tran Mills   PetscErrorCode ierr;
272938d9b04SRichard Tran Mills   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
273938d9b04SRichard Tran Mills 
274938d9b04SRichard Tran Mills   PetscFunctionBegin;
275938d9b04SRichard Tran Mills   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
276938d9b04SRichard Tran Mills 
277938d9b04SRichard Tran Mills   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
278938d9b04SRichard Tran Mills    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
279938d9b04SRichard Tran Mills    * I'm not sure if this is the best way to do this, but it avoids
280938d9b04SRichard Tran Mills    * a lot of code duplication.
281938d9b04SRichard Tran Mills    * I also note that currently MATSEQAIJPERM doesn't know anything about
282938d9b04SRichard Tran Mills    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
283938d9b04SRichard Tran Mills    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
284938d9b04SRichard Tran Mills    * this, this may break things.  (Don't know... haven't looked at it.) */
285938d9b04SRichard Tran Mills   a->inode.use = PETSC_FALSE;
286938d9b04SRichard Tran Mills   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
287938d9b04SRichard Tran Mills 
288938d9b04SRichard Tran Mills   /* Now calculate the permutation and grouping information. */
289938d9b04SRichard Tran Mills   ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr);
290938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
291938d9b04SRichard Tran Mills }
292938d9b04SRichard Tran Mills 
293938d9b04SRichard Tran Mills PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
294938d9b04SRichard Tran Mills {
295938d9b04SRichard Tran Mills   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
296938d9b04SRichard Tran Mills   const PetscScalar *x;
297938d9b04SRichard Tran Mills   PetscScalar       *y;
298938d9b04SRichard Tran Mills   const MatScalar   *aa;
299938d9b04SRichard Tran Mills   PetscErrorCode    ierr;
300938d9b04SRichard Tran Mills   const PetscInt    *aj,*ai;
301938d9b04SRichard Tran Mills #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
302938d9b04SRichard Tran Mills   PetscInt          i,j;
303938d9b04SRichard Tran Mills #endif
304*80423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
305f67b6f2eSRichard Tran Mills   __m512d           vec_x,vec_y,vec_vals;
306f67b6f2eSRichard Tran Mills   __m256i           vec_idx,vec_ipos,vec_j;
307f67b6f2eSRichard Tran Mills   __mmask8           mask;
308f67b6f2eSRichard Tran Mills #endif
309938d9b04SRichard Tran Mills 
310938d9b04SRichard Tran Mills   /* Variables that don't appear in MatMult_SeqAIJ. */
311938d9b04SRichard Tran Mills   Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
312938d9b04SRichard Tran Mills   PetscInt          *iperm;  /* Points to the permutation vector. */
313938d9b04SRichard Tran Mills   PetscInt          *xgroup;
314938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
315938d9b04SRichard Tran Mills    * begin and end in iperm. */
316938d9b04SRichard Tran Mills   PetscInt          *nzgroup;
317938d9b04SRichard Tran Mills   PetscInt          ngroup;
318938d9b04SRichard Tran Mills   PetscInt          igroup;
319938d9b04SRichard Tran Mills   PetscInt          jstart,jend;
320938d9b04SRichard Tran Mills   /* jstart is used in loops to denote the position in iperm where a
321938d9b04SRichard Tran Mills    * group starts; jend denotes the position where it ends.
322938d9b04SRichard Tran Mills    * (jend + 1 is where the next group starts.) */
323938d9b04SRichard Tran Mills   PetscInt          iold,nz;
324938d9b04SRichard Tran Mills   PetscInt          istart,iend,isize;
325938d9b04SRichard Tran Mills   PetscInt          ipos;
326938d9b04SRichard Tran Mills   PetscScalar       yp[NDIM];
327938d9b04SRichard Tran Mills   PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
328938d9b04SRichard Tran Mills 
329938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
330938d9b04SRichard Tran Mills #pragma disjoint(*x,*y,*aa)
331938d9b04SRichard Tran Mills #endif
332938d9b04SRichard Tran Mills 
333938d9b04SRichard Tran Mills   PetscFunctionBegin;
334938d9b04SRichard Tran Mills   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
335938d9b04SRichard Tran Mills   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
336938d9b04SRichard Tran Mills   aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
337938d9b04SRichard Tran Mills   aa   = a->a; /* Nonzero elements stored row-by-row. */
338938d9b04SRichard Tran Mills   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
339938d9b04SRichard Tran Mills 
340938d9b04SRichard Tran Mills   /* Get the info we need about the permutations and groupings. */
341938d9b04SRichard Tran Mills   iperm   = aijperm->iperm;
342938d9b04SRichard Tran Mills   ngroup  = aijperm->ngroup;
343938d9b04SRichard Tran Mills   xgroup  = aijperm->xgroup;
344938d9b04SRichard Tran Mills   nzgroup = aijperm->nzgroup;
345938d9b04SRichard Tran Mills 
346938d9b04SRichard Tran Mills #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
347938d9b04SRichard Tran Mills   fortranmultaijperm_(&m,x,ii,aj,aa,y);
348938d9b04SRichard Tran Mills #else
349938d9b04SRichard Tran Mills 
350938d9b04SRichard Tran Mills   for (igroup=0; igroup<ngroup; igroup++) {
351938d9b04SRichard Tran Mills     jstart = xgroup[igroup];
352938d9b04SRichard Tran Mills     jend   = xgroup[igroup+1] - 1;
353938d9b04SRichard Tran Mills     nz     = nzgroup[igroup];
354938d9b04SRichard Tran Mills 
355938d9b04SRichard Tran Mills     /* Handle the special cases where the number of nonzeros per row
356938d9b04SRichard Tran Mills      * in the group is either 0 or 1. */
357938d9b04SRichard Tran Mills     if (nz == 0) {
358938d9b04SRichard Tran Mills       for (i=jstart; i<=jend; i++) {
359938d9b04SRichard Tran Mills         y[iperm[i]] = 0.0;
360938d9b04SRichard Tran Mills       }
361938d9b04SRichard Tran Mills     } else if (nz == 1) {
362938d9b04SRichard Tran Mills       for (i=jstart; i<=jend; i++) {
363938d9b04SRichard Tran Mills         iold    = iperm[i];
364938d9b04SRichard Tran Mills         ipos    = ai[iold];
365938d9b04SRichard Tran Mills         y[iold] = aa[ipos] * x[aj[ipos]];
366938d9b04SRichard Tran Mills       }
367938d9b04SRichard Tran Mills     } else {
368938d9b04SRichard Tran Mills 
369938d9b04SRichard Tran Mills       /* We work our way through the current group in chunks of NDIM rows
370938d9b04SRichard Tran Mills        * at a time. */
371938d9b04SRichard Tran Mills 
372938d9b04SRichard Tran Mills       for (istart=jstart; istart<=jend; istart+=NDIM) {
373938d9b04SRichard Tran Mills         /* Figure out where the chunk of 'isize' rows ends in iperm.
374938d9b04SRichard Tran Mills          * 'isize may of course be less than NDIM for the last chunk. */
375938d9b04SRichard Tran Mills         iend = istart + (NDIM - 1);
376938d9b04SRichard Tran Mills 
377938d9b04SRichard Tran Mills         if (iend > jend) iend = jend;
378938d9b04SRichard Tran Mills 
379938d9b04SRichard Tran Mills         isize = iend - istart + 1;
380938d9b04SRichard Tran Mills 
381938d9b04SRichard Tran Mills         /* Initialize the yp[] array that will be used to hold part of
382938d9b04SRichard Tran Mills          * the permuted results vector, and figure out where in aa each
383938d9b04SRichard Tran Mills          * row of the chunk will begin. */
384938d9b04SRichard Tran Mills         for (i=0; i<isize; i++) {
385938d9b04SRichard Tran Mills           iold = iperm[istart + i];
386938d9b04SRichard Tran Mills           /* iold is a row number from the matrix A *before* reordering. */
387938d9b04SRichard Tran Mills           ip[i] = ai[iold];
388938d9b04SRichard Tran Mills           /* ip[i] tells us where the ith row of the chunk begins in aa. */
389938d9b04SRichard Tran Mills           yp[i] = (PetscScalar) 0.0;
390938d9b04SRichard Tran Mills         }
391938d9b04SRichard Tran Mills 
392938d9b04SRichard Tran Mills         /* If the number of zeros per row exceeds the number of rows in
393938d9b04SRichard Tran Mills          * the chunk, we should vectorize along nz, that is, perform the
394938d9b04SRichard Tran Mills          * mat-vec one row at a time as in the usual CSR case. */
395938d9b04SRichard Tran Mills         if (nz > isize) {
396938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
397938d9b04SRichard Tran Mills #pragma _CRI preferstream
398938d9b04SRichard Tran Mills #endif
399938d9b04SRichard Tran Mills           for (i=0; i<isize; i++) {
400938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
401938d9b04SRichard Tran Mills #pragma _CRI prefervector
402938d9b04SRichard Tran Mills #endif
403f67b6f2eSRichard Tran Mills 
404*80423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
405f67b6f2eSRichard Tran Mills             vec_y = _mm512_setzero_pd();
406f67b6f2eSRichard Tran Mills             ipos = ip[i];
407f67b6f2eSRichard Tran Mills             for (j=0; j<(nz>>3); j++) {
408f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
409f67b6f2eSRichard Tran Mills               vec_vals = _mm512_loadu_pd(&aa[ipos]);
410f67b6f2eSRichard Tran Mills               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
411f67b6f2eSRichard Tran Mills               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
412f67b6f2eSRichard Tran Mills               ipos += 8;
413f67b6f2eSRichard Tran Mills             }
414f67b6f2eSRichard Tran Mills             if ((nz&0x07)>2) {
415f67b6f2eSRichard Tran Mills               mask     = (__mmask8)(0xff >> (8-(nz&0x07)));
416f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
417f67b6f2eSRichard Tran Mills               vec_vals = _mm512_loadu_pd(&aa[ipos]);
418f67b6f2eSRichard Tran Mills               vec_x    = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
419f67b6f2eSRichard Tran Mills               vec_y    = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
420f67b6f2eSRichard Tran Mills             } else if ((nz&0x07)==2) {
421f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos]*x[aj[ipos]];
422f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos+1]*x[aj[ipos+1]];
423f67b6f2eSRichard Tran Mills             } else if ((nz&0x07)==1) {
424f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos]*x[aj[ipos]];
425f67b6f2eSRichard Tran Mills             }
426f67b6f2eSRichard Tran Mills             yp[i] += _mm512_reduce_add_pd(vec_y);
427f67b6f2eSRichard Tran Mills #else
428938d9b04SRichard Tran Mills             for (j=0; j<nz; j++) {
429938d9b04SRichard Tran Mills               ipos   = ip[i] + j;
430938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
431938d9b04SRichard Tran Mills             }
432f67b6f2eSRichard Tran Mills #endif
433938d9b04SRichard Tran Mills           }
434938d9b04SRichard Tran Mills         } else {
435938d9b04SRichard Tran Mills           /* Otherwise, there are enough rows in the chunk to make it
436938d9b04SRichard Tran Mills            * worthwhile to vectorize across the rows, that is, to do the
437938d9b04SRichard Tran Mills            * matvec by operating with "columns" of the chunk. */
438938d9b04SRichard Tran Mills           for (j=0; j<nz; j++) {
439*80423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
440f67b6f2eSRichard Tran Mills             vec_j = _mm256_set1_epi32(j);
441f67b6f2eSRichard Tran Mills             for (i=0; i<((isize>>3)<<3); i+=8) {
442f67b6f2eSRichard Tran Mills               vec_y    = _mm512_loadu_pd(&yp[i]);
443f67b6f2eSRichard Tran Mills               vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
444f67b6f2eSRichard Tran Mills               vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
445f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
446f67b6f2eSRichard Tran Mills               vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
447f67b6f2eSRichard Tran Mills               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
448f67b6f2eSRichard Tran Mills               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
449f67b6f2eSRichard Tran Mills               _mm512_storeu_pd(&yp[i],vec_y);
450f67b6f2eSRichard Tran Mills             }
451f67b6f2eSRichard Tran Mills             for (i=isize-(isize&0x07); i<isize; i++) {
452f67b6f2eSRichard Tran Mills               ipos = ip[i]+j;
453f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos]*x[aj[ipos]];
454f67b6f2eSRichard Tran Mills             }
455f67b6f2eSRichard Tran Mills #else
456938d9b04SRichard Tran Mills             for (i=0; i<isize; i++) {
457938d9b04SRichard Tran Mills               ipos   = ip[i] + j;
458938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
459938d9b04SRichard Tran Mills             }
460f67b6f2eSRichard Tran Mills #endif
461938d9b04SRichard Tran Mills           }
462938d9b04SRichard Tran Mills         }
463938d9b04SRichard Tran Mills 
464938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
465938d9b04SRichard Tran Mills #pragma _CRI ivdep
466938d9b04SRichard Tran Mills #endif
467938d9b04SRichard Tran Mills         /* Put results from yp[] into non-permuted result vector y. */
468938d9b04SRichard Tran Mills         for (i=0; i<isize; i++) {
469938d9b04SRichard Tran Mills           y[iperm[istart+i]] = yp[i];
470938d9b04SRichard Tran Mills         }
471938d9b04SRichard Tran Mills       } /* End processing chunk of isize rows of a group. */
472938d9b04SRichard Tran Mills     } /* End handling matvec for chunk with nz > 1. */
473938d9b04SRichard Tran Mills   } /* End loop over igroup. */
474938d9b04SRichard Tran Mills #endif
475938d9b04SRichard Tran Mills   ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr);
476938d9b04SRichard Tran Mills   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
477938d9b04SRichard Tran Mills   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
478938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
479938d9b04SRichard Tran Mills }
480938d9b04SRichard Tran Mills 
481938d9b04SRichard Tran Mills 
482938d9b04SRichard Tran Mills /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
483938d9b04SRichard Tran Mills  * Note that the names I used to designate the vectors differs from that
484938d9b04SRichard Tran Mills  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
485938d9b04SRichard Tran Mills  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
486938d9b04SRichard Tran Mills /*
487938d9b04SRichard Tran Mills     I hate having virtually identical code for the mult and the multadd!!!
488938d9b04SRichard Tran Mills */
489938d9b04SRichard Tran Mills PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
490938d9b04SRichard Tran Mills {
491938d9b04SRichard Tran Mills   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
492938d9b04SRichard Tran Mills   const PetscScalar *x;
493938d9b04SRichard Tran Mills   PetscScalar       *y,*w;
494938d9b04SRichard Tran Mills   const MatScalar   *aa;
495938d9b04SRichard Tran Mills   PetscErrorCode    ierr;
496938d9b04SRichard Tran Mills   const PetscInt    *aj,*ai;
497938d9b04SRichard Tran Mills #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
498938d9b04SRichard Tran Mills   PetscInt i,j;
499938d9b04SRichard Tran Mills #endif
500938d9b04SRichard Tran Mills 
501938d9b04SRichard Tran Mills   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
502938d9b04SRichard Tran Mills   Mat_SeqAIJPERM * aijperm;
503938d9b04SRichard Tran Mills   PetscInt       *iperm;    /* Points to the permutation vector. */
504938d9b04SRichard Tran Mills   PetscInt       *xgroup;
505938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
506938d9b04SRichard Tran Mills    * begin and end in iperm. */
507938d9b04SRichard Tran Mills   PetscInt *nzgroup;
508938d9b04SRichard Tran Mills   PetscInt ngroup;
509938d9b04SRichard Tran Mills   PetscInt igroup;
510938d9b04SRichard Tran Mills   PetscInt jstart,jend;
511938d9b04SRichard Tran Mills   /* jstart is used in loops to denote the position in iperm where a
512938d9b04SRichard Tran Mills    * group starts; jend denotes the position where it ends.
513938d9b04SRichard Tran Mills    * (jend + 1 is where the next group starts.) */
514938d9b04SRichard Tran Mills   PetscInt    iold,nz;
515938d9b04SRichard Tran Mills   PetscInt    istart,iend,isize;
516938d9b04SRichard Tran Mills   PetscInt    ipos;
517938d9b04SRichard Tran Mills   PetscScalar yp[NDIM];
518938d9b04SRichard Tran Mills   PetscInt    ip[NDIM];
519938d9b04SRichard Tran Mills   /* yp[] and ip[] are treated as vector "registers" for performing
520938d9b04SRichard Tran Mills    * the mat-vec. */
521938d9b04SRichard Tran Mills 
522938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
523938d9b04SRichard Tran Mills #pragma disjoint(*x,*y,*aa)
524938d9b04SRichard Tran Mills #endif
525938d9b04SRichard Tran Mills 
526938d9b04SRichard Tran Mills   PetscFunctionBegin;
527938d9b04SRichard Tran Mills   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
528938d9b04SRichard Tran Mills   ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
529938d9b04SRichard Tran Mills 
530938d9b04SRichard Tran Mills   aj = a->j;   /* aj[k] gives column index for element aa[k]. */
531938d9b04SRichard Tran Mills   aa = a->a;   /* Nonzero elements stored row-by-row. */
532938d9b04SRichard Tran Mills   ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */
533938d9b04SRichard Tran Mills 
534938d9b04SRichard Tran Mills   /* Get the info we need about the permutations and groupings. */
535938d9b04SRichard Tran Mills   aijperm = (Mat_SeqAIJPERM*) A->spptr;
536938d9b04SRichard Tran Mills   iperm   = aijperm->iperm;
537938d9b04SRichard Tran Mills   ngroup  = aijperm->ngroup;
538938d9b04SRichard Tran Mills   xgroup  = aijperm->xgroup;
539938d9b04SRichard Tran Mills   nzgroup = aijperm->nzgroup;
540938d9b04SRichard Tran Mills 
541938d9b04SRichard Tran Mills #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
542938d9b04SRichard Tran Mills   fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
543938d9b04SRichard Tran Mills #else
544938d9b04SRichard Tran Mills 
545938d9b04SRichard Tran Mills   for (igroup=0; igroup<ngroup; igroup++) {
546938d9b04SRichard Tran Mills     jstart = xgroup[igroup];
547938d9b04SRichard Tran Mills     jend   = xgroup[igroup+1] - 1;
548938d9b04SRichard Tran Mills 
549938d9b04SRichard Tran Mills     nz = nzgroup[igroup];
550938d9b04SRichard Tran Mills 
551938d9b04SRichard Tran Mills     /* Handle the special cases where the number of nonzeros per row
552938d9b04SRichard Tran Mills      * in the group is either 0 or 1. */
553938d9b04SRichard Tran Mills     if (nz == 0) {
554938d9b04SRichard Tran Mills       for (i=jstart; i<=jend; i++) {
555938d9b04SRichard Tran Mills         iold    = iperm[i];
556938d9b04SRichard Tran Mills         y[iold] = w[iold];
557938d9b04SRichard Tran Mills       }
558938d9b04SRichard Tran Mills     }
559938d9b04SRichard Tran Mills     else if (nz == 1) {
560938d9b04SRichard Tran Mills       for (i=jstart; i<=jend; i++) {
561938d9b04SRichard Tran Mills         iold    = iperm[i];
562938d9b04SRichard Tran Mills         ipos    = ai[iold];
563938d9b04SRichard Tran Mills         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
564938d9b04SRichard Tran Mills       }
565938d9b04SRichard Tran Mills     }
566938d9b04SRichard Tran Mills     /* For the general case: */
567938d9b04SRichard Tran Mills     else {
568938d9b04SRichard Tran Mills 
569938d9b04SRichard Tran Mills       /* We work our way through the current group in chunks of NDIM rows
570938d9b04SRichard Tran Mills        * at a time. */
571938d9b04SRichard Tran Mills 
572938d9b04SRichard Tran Mills       for (istart=jstart; istart<=jend; istart+=NDIM) {
573938d9b04SRichard Tran Mills         /* Figure out where the chunk of 'isize' rows ends in iperm.
574938d9b04SRichard Tran Mills          * 'isize may of course be less than NDIM for the last chunk. */
575938d9b04SRichard Tran Mills         iend = istart + (NDIM - 1);
576938d9b04SRichard Tran Mills         if (iend > jend) iend = jend;
577938d9b04SRichard Tran Mills         isize = iend - istart + 1;
578938d9b04SRichard Tran Mills 
579938d9b04SRichard Tran Mills         /* Initialize the yp[] array that will be used to hold part of
580938d9b04SRichard Tran Mills          * the permuted results vector, and figure out where in aa each
581938d9b04SRichard Tran Mills          * row of the chunk will begin. */
582938d9b04SRichard Tran Mills         for (i=0; i<isize; i++) {
583938d9b04SRichard Tran Mills           iold = iperm[istart + i];
584938d9b04SRichard Tran Mills           /* iold is a row number from the matrix A *before* reordering. */
585938d9b04SRichard Tran Mills           ip[i] = ai[iold];
586938d9b04SRichard Tran Mills           /* ip[i] tells us where the ith row of the chunk begins in aa. */
587938d9b04SRichard Tran Mills           yp[i] = w[iold];
588938d9b04SRichard Tran Mills         }
589938d9b04SRichard Tran Mills 
590938d9b04SRichard Tran Mills         /* If the number of zeros per row exceeds the number of rows in
591938d9b04SRichard Tran Mills          * the chunk, we should vectorize along nz, that is, perform the
592938d9b04SRichard Tran Mills          * mat-vec one row at a time as in the usual CSR case. */
593938d9b04SRichard Tran Mills         if (nz > isize) {
594938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
595938d9b04SRichard Tran Mills #pragma _CRI preferstream
596938d9b04SRichard Tran Mills #endif
597938d9b04SRichard Tran Mills           for (i=0; i<isize; i++) {
598938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
599938d9b04SRichard Tran Mills #pragma _CRI prefervector
600938d9b04SRichard Tran Mills #endif
601938d9b04SRichard Tran Mills             for (j=0; j<nz; j++) {
602938d9b04SRichard Tran Mills               ipos   = ip[i] + j;
603938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
604938d9b04SRichard Tran Mills             }
605938d9b04SRichard Tran Mills           }
606938d9b04SRichard Tran Mills         }
607938d9b04SRichard Tran Mills         /* Otherwise, there are enough rows in the chunk to make it
608938d9b04SRichard Tran Mills          * worthwhile to vectorize across the rows, that is, to do the
609938d9b04SRichard Tran Mills          * matvec by operating with "columns" of the chunk. */
610938d9b04SRichard Tran Mills         else {
611938d9b04SRichard Tran Mills           for (j=0; j<nz; j++) {
612938d9b04SRichard Tran Mills             for (i=0; i<isize; i++) {
613938d9b04SRichard Tran Mills               ipos   = ip[i] + j;
614938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
615938d9b04SRichard Tran Mills             }
616938d9b04SRichard Tran Mills           }
617938d9b04SRichard Tran Mills         }
618938d9b04SRichard Tran Mills 
619938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
620938d9b04SRichard Tran Mills #pragma _CRI ivdep
621938d9b04SRichard Tran Mills #endif
622938d9b04SRichard Tran Mills         /* Put results from yp[] into non-permuted result vector y. */
623938d9b04SRichard Tran Mills         for (i=0; i<isize; i++) {
624938d9b04SRichard Tran Mills           y[iperm[istart+i]] = yp[i];
625938d9b04SRichard Tran Mills         }
626938d9b04SRichard Tran Mills       } /* End processing chunk of isize rows of a group. */
627938d9b04SRichard Tran Mills 
628938d9b04SRichard Tran Mills     } /* End handling matvec for chunk with nz > 1. */
629938d9b04SRichard Tran Mills   } /* End loop over igroup. */
630938d9b04SRichard Tran Mills 
631938d9b04SRichard Tran Mills #endif
632938d9b04SRichard Tran Mills   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
633938d9b04SRichard Tran Mills   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
634938d9b04SRichard Tran Mills   ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
635938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
636938d9b04SRichard Tran Mills }
637938d9b04SRichard Tran Mills 
6380d26a50cSRichard Tran Mills /* This function prototype is needed in MatConvert_SeqAIJ_SeqAIJPERM(), below. */
6390d26a50cSRichard Tran Mills PETSC_INTERN PetscErrorCode MatPtAP_IS_XAIJ(Mat,Mat,MatReuse,PetscReal,Mat*);
640938d9b04SRichard Tran Mills 
641938d9b04SRichard Tran Mills /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
642938d9b04SRichard Tran Mills  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
643938d9b04SRichard Tran Mills  * routine, but can also be used to convert an assembled SeqAIJ matrix
644938d9b04SRichard Tran Mills  * into a SeqAIJPERM one. */
645938d9b04SRichard Tran Mills PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
646938d9b04SRichard Tran Mills {
647938d9b04SRichard Tran Mills   PetscErrorCode ierr;
648938d9b04SRichard Tran Mills   Mat            B = *newmat;
649938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm;
650938d9b04SRichard Tran Mills   PetscBool      sametype;
651938d9b04SRichard Tran Mills 
652938d9b04SRichard Tran Mills   PetscFunctionBegin;
653938d9b04SRichard Tran Mills   if (reuse == MAT_INITIAL_MATRIX) {
654938d9b04SRichard Tran Mills     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
655938d9b04SRichard Tran Mills   }
656938d9b04SRichard Tran Mills   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
657938d9b04SRichard Tran Mills   if (sametype) PetscFunctionReturn(0);
658938d9b04SRichard Tran Mills 
659938d9b04SRichard Tran Mills   ierr     = PetscNewLog(B,&aijperm);CHKERRQ(ierr);
660938d9b04SRichard Tran Mills   B->spptr = (void*) aijperm;
661938d9b04SRichard Tran Mills 
662938d9b04SRichard Tran Mills   /* Set function pointers for methods that we inherit from AIJ but override. */
663938d9b04SRichard Tran Mills   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
664938d9b04SRichard Tran Mills   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
665938d9b04SRichard Tran Mills   B->ops->destroy     = MatDestroy_SeqAIJPERM;
666938d9b04SRichard Tran Mills   B->ops->mult        = MatMult_SeqAIJPERM;
667938d9b04SRichard Tran Mills   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
668938d9b04SRichard Tran Mills 
669938d9b04SRichard Tran Mills   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
670938d9b04SRichard Tran Mills   /* If A has already been assembled, compute the permutation. */
671938d9b04SRichard Tran Mills   if (A->assembled) {
672938d9b04SRichard Tran Mills     ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr);
673938d9b04SRichard Tran Mills   }
674938d9b04SRichard Tran Mills 
675938d9b04SRichard Tran Mills   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
676938d9b04SRichard Tran Mills   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr);
677938d9b04SRichard Tran Mills   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr);
678938d9b04SRichard Tran Mills   ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr);
6790d26a50cSRichard Tran Mills   ierr = PetscObjectComposeFunction((PetscObject)B,"MatPtAP_is_seqaijperm_C",MatPtAP_IS_XAIJ);CHKERRQ(ierr);
680938d9b04SRichard Tran Mills 
681938d9b04SRichard Tran Mills   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr);
682938d9b04SRichard Tran Mills   *newmat = B;
683938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
684938d9b04SRichard Tran Mills }
685938d9b04SRichard Tran Mills 
686938d9b04SRichard Tran Mills /*@C
687938d9b04SRichard Tran Mills    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
688938d9b04SRichard Tran Mills    This type inherits from AIJ, but calculates some additional permutation
689938d9b04SRichard Tran Mills    information that is used to allow better vectorization of some
690938d9b04SRichard Tran Mills    operations.  At the cost of increased storage, the AIJ formatted
691938d9b04SRichard Tran Mills    matrix can be copied to a format in which pieces of the matrix are
692938d9b04SRichard Tran Mills    stored in ELLPACK format, allowing the vectorized matrix multiply
693938d9b04SRichard Tran Mills    routine to use stride-1 memory accesses.  As with the AIJ type, it is
694938d9b04SRichard Tran Mills    important to preallocate matrix storage in order to get good assembly
695938d9b04SRichard Tran Mills    performance.
696938d9b04SRichard Tran Mills 
697d083f849SBarry Smith    Collective
698938d9b04SRichard Tran Mills 
699938d9b04SRichard Tran Mills    Input Parameters:
700938d9b04SRichard Tran Mills +  comm - MPI communicator, set to PETSC_COMM_SELF
701938d9b04SRichard Tran Mills .  m - number of rows
702938d9b04SRichard Tran Mills .  n - number of columns
703938d9b04SRichard Tran Mills .  nz - number of nonzeros per row (same for all rows)
704938d9b04SRichard Tran Mills -  nnz - array containing the number of nonzeros in the various rows
705938d9b04SRichard Tran Mills          (possibly different for each row) or NULL
706938d9b04SRichard Tran Mills 
707938d9b04SRichard Tran Mills    Output Parameter:
708938d9b04SRichard Tran Mills .  A - the matrix
709938d9b04SRichard Tran Mills 
710938d9b04SRichard Tran Mills    Notes:
711938d9b04SRichard Tran Mills    If nnz is given then nz is ignored
712938d9b04SRichard Tran Mills 
713938d9b04SRichard Tran Mills    Level: intermediate
714938d9b04SRichard Tran Mills 
715938d9b04SRichard Tran Mills .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
716938d9b04SRichard Tran Mills @*/
717938d9b04SRichard Tran Mills PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
718938d9b04SRichard Tran Mills {
719938d9b04SRichard Tran Mills   PetscErrorCode ierr;
720938d9b04SRichard Tran Mills 
721938d9b04SRichard Tran Mills   PetscFunctionBegin;
722938d9b04SRichard Tran Mills   ierr = MatCreate(comm,A);CHKERRQ(ierr);
723938d9b04SRichard Tran Mills   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
724938d9b04SRichard Tran Mills   ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr);
725938d9b04SRichard Tran Mills   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
726938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
727938d9b04SRichard Tran Mills }
728938d9b04SRichard Tran Mills 
729938d9b04SRichard Tran Mills PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
730938d9b04SRichard Tran Mills {
731938d9b04SRichard Tran Mills   PetscErrorCode ierr;
732938d9b04SRichard Tran Mills 
733938d9b04SRichard Tran Mills   PetscFunctionBegin;
734938d9b04SRichard Tran Mills   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
735938d9b04SRichard Tran Mills   ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
736938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
737938d9b04SRichard Tran Mills }
738938d9b04SRichard Tran Mills 
739