xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision 6aad120caa16b1027d343a5f30f73d01448e4dc0)
1938d9b04SRichard Tran Mills 
2938d9b04SRichard Tran Mills /*
3938d9b04SRichard Tran Mills   Defines basic operations for the MATSEQAIJPERM matrix class.
4938d9b04SRichard Tran Mills   This class is derived from the MATSEQAIJ class and retains the
5938d9b04SRichard Tran Mills   compressed row storage (aka Yale sparse matrix format) but augments
6938d9b04SRichard Tran Mills   it with some permutation information that enables some operations
7938d9b04SRichard Tran Mills   to be more vectorizable.  A physically rearranged copy of the matrix
8938d9b04SRichard Tran Mills   may be stored if the user desires.
9938d9b04SRichard Tran Mills 
10938d9b04SRichard Tran Mills   Eventually a variety of permutations may be supported.
11938d9b04SRichard Tran Mills */
12938d9b04SRichard Tran Mills 
13938d9b04SRichard Tran Mills #include <../src/mat/impls/aij/seq/aij.h>
14938d9b04SRichard Tran Mills 
1580423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16f67b6f2eSRichard Tran Mills #include <immintrin.h>
17f67b6f2eSRichard Tran Mills 
18f67b6f2eSRichard Tran Mills #if !defined(_MM_SCALE_8)
19f67b6f2eSRichard Tran Mills #define _MM_SCALE_8    8
20f67b6f2eSRichard Tran Mills #endif
21f67b6f2eSRichard Tran Mills #if !defined(_MM_SCALE_4)
22f67b6f2eSRichard Tran Mills #define _MM_SCALE_4    4
23f67b6f2eSRichard Tran Mills #endif
24f67b6f2eSRichard Tran Mills #endif
25f67b6f2eSRichard Tran Mills 
26938d9b04SRichard Tran Mills #define NDIM 512
27938d9b04SRichard Tran Mills /* NDIM specifies how many rows at a time we should work with when
28938d9b04SRichard Tran Mills  * performing the vectorized mat-vec.  This depends on various factors
29938d9b04SRichard Tran Mills  * such as vector register length, etc., and I really need to add a
30938d9b04SRichard Tran Mills  * way for the user (or the library) to tune this.  I'm setting it to
31938d9b04SRichard Tran Mills  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32938d9b04SRichard Tran Mills  * routines. */
33938d9b04SRichard Tran Mills 
34938d9b04SRichard Tran Mills typedef struct {
35938d9b04SRichard Tran Mills   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36938d9b04SRichard Tran Mills 
37938d9b04SRichard Tran Mills   PetscInt         ngroup;
38938d9b04SRichard Tran Mills   PetscInt         *xgroup;
39938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
40938d9b04SRichard Tran Mills    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41938d9b04SRichard Tran Mills    * where the ith group begins. */
42938d9b04SRichard Tran Mills 
43938d9b04SRichard Tran Mills   PetscInt         *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
44938d9b04SRichard Tran Mills   PetscInt         *iperm;  /* The permutation vector. */
45938d9b04SRichard Tran Mills 
46938d9b04SRichard Tran Mills   /* Some of this stuff is for Ed's recursive triangular solve.
47938d9b04SRichard Tran Mills    * I'm not sure what I need yet. */
48938d9b04SRichard Tran Mills   PetscInt         blocksize;
49938d9b04SRichard Tran Mills   PetscInt         nstep;
50938d9b04SRichard Tran Mills   PetscInt         *jstart_list;
51938d9b04SRichard Tran Mills   PetscInt         *jend_list;
52938d9b04SRichard Tran Mills   PetscInt         *action_list;
53938d9b04SRichard Tran Mills   PetscInt         *ngroup_list;
54938d9b04SRichard Tran Mills   PetscInt         **ipointer_list;
55938d9b04SRichard Tran Mills   PetscInt         **xgroup_list;
56938d9b04SRichard Tran Mills   PetscInt         **nzgroup_list;
57938d9b04SRichard Tran Mills   PetscInt         **iperm_list;
58938d9b04SRichard Tran Mills } Mat_SeqAIJPERM;
59938d9b04SRichard Tran Mills 
60938d9b04SRichard Tran Mills PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61938d9b04SRichard Tran Mills {
62938d9b04SRichard Tran Mills   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63938d9b04SRichard Tran Mills   /* so we will ignore 'MatType type'. */
64938d9b04SRichard Tran Mills   Mat            B       = *newmat;
65938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
66938d9b04SRichard Tran Mills 
67938d9b04SRichard Tran Mills   PetscFunctionBegin;
68938d9b04SRichard Tran Mills   if (reuse == MAT_INITIAL_MATRIX) {
699566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,&B));
70938d9b04SRichard Tran Mills     aijperm=(Mat_SeqAIJPERM*)B->spptr;
71938d9b04SRichard Tran Mills   }
72938d9b04SRichard Tran Mills 
73938d9b04SRichard Tran Mills   /* Reset the original function pointers. */
74938d9b04SRichard Tran Mills   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
75938d9b04SRichard Tran Mills   B->ops->destroy     = MatDestroy_SeqAIJ;
76938d9b04SRichard Tran Mills   B->ops->duplicate   = MatDuplicate_SeqAIJ;
77938d9b04SRichard Tran Mills   B->ops->mult        = MatMult_SeqAIJ;
78938d9b04SRichard Tran Mills   B->ops->multadd     = MatMultAdd_SeqAIJ;
79938d9b04SRichard Tran Mills 
809566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL));
81938d9b04SRichard Tran Mills 
82938d9b04SRichard Tran Mills   /* Free everything in the Mat_SeqAIJPERM data structure.*/
839566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->xgroup));
849566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->nzgroup));
859566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->iperm));
869566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->spptr));
87938d9b04SRichard Tran Mills 
88938d9b04SRichard Tran Mills   /* Change the type of B to MATSEQAIJ. */
899566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));
90938d9b04SRichard Tran Mills 
91938d9b04SRichard Tran Mills   *newmat = B;
92938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
93938d9b04SRichard Tran Mills }
94938d9b04SRichard Tran Mills 
95938d9b04SRichard Tran Mills PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
96938d9b04SRichard Tran Mills {
97938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
98938d9b04SRichard Tran Mills 
99938d9b04SRichard Tran Mills   PetscFunctionBegin;
100938d9b04SRichard Tran Mills   if (aijperm) {
101938d9b04SRichard Tran Mills     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
1029566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->xgroup));
1039566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->nzgroup));
1049566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->iperm));
1059566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->spptr));
106938d9b04SRichard Tran Mills   }
107938d9b04SRichard Tran Mills   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
108938d9b04SRichard Tran Mills    * to destroy everything that remains. */
1099566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
110938d9b04SRichard Tran Mills   /* Note that I don't call MatSetType().  I believe this is because that
111938d9b04SRichard Tran Mills    * is only to be called when *building* a matrix.  I could be wrong, but
112938d9b04SRichard Tran Mills    * that is how things work for the SuperLU matrix class. */
1139566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
114938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
115938d9b04SRichard Tran Mills }
116938d9b04SRichard Tran Mills 
117938d9b04SRichard Tran Mills PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
118938d9b04SRichard Tran Mills {
119938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
120938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm_dest;
121938d9b04SRichard Tran Mills   PetscBool      perm;
122938d9b04SRichard Tran Mills 
123938d9b04SRichard Tran Mills   PetscFunctionBegin;
1249566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A,op,M));
1259566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm));
126938d9b04SRichard Tran Mills   if (perm) {
127938d9b04SRichard Tran Mills     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
1289566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->xgroup));
1299566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->nzgroup));
1309566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->iperm));
131938d9b04SRichard Tran Mills   } else {
1329566063dSJacob Faibussowitsch     PetscCall(PetscNewLog(*M,&aijperm_dest));
133938d9b04SRichard Tran Mills     (*M)->spptr = (void*) aijperm_dest;
1349566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM));
1359566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ));
136938d9b04SRichard Tran Mills   }
1379566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest,aijperm,1));
138938d9b04SRichard Tran Mills   /* Allocate space for, and copy the grouping and permutation info.
139938d9b04SRichard Tran Mills    * I note that when the groups are initially determined in
140938d9b04SRichard Tran Mills    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
141938d9b04SRichard Tran Mills    * necessary.  But at this point, we know how large they need to be, and
142938d9b04SRichard Tran Mills    * allocate only the necessary amount of memory.  So the duplicated matrix
143938d9b04SRichard Tran Mills    * may actually use slightly less storage than the original! */
1449566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
1459566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup));
1469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
1479566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n));
1489566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1));
1499566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup));
150938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
151938d9b04SRichard Tran Mills }
152938d9b04SRichard Tran Mills 
153938d9b04SRichard Tran Mills PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
154938d9b04SRichard Tran Mills {
155938d9b04SRichard Tran Mills   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
156938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
157938d9b04SRichard Tran Mills   PetscInt       m;       /* Number of rows in the matrix. */
158938d9b04SRichard Tran Mills   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
159938d9b04SRichard Tran Mills   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
160938d9b04SRichard Tran Mills   PetscInt       *rows_in_bucket;
161938d9b04SRichard Tran Mills   /* To construct the permutation, we sort each row into one of maxnz
162938d9b04SRichard Tran Mills    * buckets based on how many nonzeros are in the row. */
163938d9b04SRichard Tran Mills   PetscInt       nz;
164938d9b04SRichard Tran Mills   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
165938d9b04SRichard Tran Mills   PetscInt       *ipnz;
166938d9b04SRichard Tran Mills   /* When constructing the iperm permutation vector,
167938d9b04SRichard Tran Mills    * ipnz[nz] is used to point to the next place in the permutation vector
168938d9b04SRichard Tran Mills    * that a row with nz nonzero elements should be placed.*/
169938d9b04SRichard Tran Mills   PetscInt       i, ngroup, istart, ipos;
170938d9b04SRichard Tran Mills 
171938d9b04SRichard Tran Mills   PetscFunctionBegin;
172938d9b04SRichard Tran Mills   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
173938d9b04SRichard Tran Mills   aijperm->nonzerostate = A->nonzerostate;
174938d9b04SRichard Tran Mills  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
1759566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->xgroup));
1769566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->nzgroup));
1779566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->iperm));
178938d9b04SRichard Tran Mills 
179938d9b04SRichard Tran Mills   m  = A->rmap->n;
180938d9b04SRichard Tran Mills   ia = a->i;
181938d9b04SRichard Tran Mills 
182938d9b04SRichard Tran Mills   /* Allocate the arrays that will hold the permutation vector. */
1839566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aijperm->iperm));
184938d9b04SRichard Tran Mills 
185938d9b04SRichard Tran Mills   /* Allocate some temporary work arrays that will be used in
186*6aad120cSJose E. Roman    * calculating the permutation vector and groupings. */
1879566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &nz_in_row));
188938d9b04SRichard Tran Mills 
189938d9b04SRichard Tran Mills   /* Now actually figure out the permutation and grouping. */
190938d9b04SRichard Tran Mills 
191938d9b04SRichard Tran Mills   /* First pass: Determine number of nonzeros in each row, maximum
192938d9b04SRichard Tran Mills    * number of nonzeros in any row, and how many rows fall into each
193938d9b04SRichard Tran Mills    * "bucket" of rows with same number of nonzeros. */
194938d9b04SRichard Tran Mills   maxnz = 0;
195938d9b04SRichard Tran Mills   for (i=0; i<m; i++) {
196938d9b04SRichard Tran Mills     nz_in_row[i] = ia[i+1]-ia[i];
197938d9b04SRichard Tran Mills     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
198938d9b04SRichard Tran Mills   }
1999566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket));
2009566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz));
201938d9b04SRichard Tran Mills 
202938d9b04SRichard Tran Mills   for (i=0; i<=maxnz; i++) {
203938d9b04SRichard Tran Mills     rows_in_bucket[i] = 0;
204938d9b04SRichard Tran Mills   }
205938d9b04SRichard Tran Mills   for (i=0; i<m; i++) {
206938d9b04SRichard Tran Mills     nz = nz_in_row[i];
207938d9b04SRichard Tran Mills     rows_in_bucket[nz]++;
208938d9b04SRichard Tran Mills   }
209938d9b04SRichard Tran Mills 
210938d9b04SRichard Tran Mills   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
211938d9b04SRichard Tran Mills    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
212938d9b04SRichard Tran Mills    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
213938d9b04SRichard Tran Mills    * then xgroup[] must consist of (maxnz + 2) elements, since the last
214938d9b04SRichard Tran Mills    * element of xgroup will tell us where the (maxnz + 1)th group ends.
215938d9b04SRichard Tran Mills    * We allocate space for the maximum number of groups;
216938d9b04SRichard Tran Mills    * that is potentially a little wasteful, but not too much so.
217938d9b04SRichard Tran Mills    * Perhaps I should fix it later. */
2189566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(maxnz+2, &aijperm->xgroup));
2199566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(maxnz+1, &aijperm->nzgroup));
220938d9b04SRichard Tran Mills 
221938d9b04SRichard Tran Mills   /* Second pass.  Look at what is in the buckets and create the groupings.
222938d9b04SRichard Tran Mills    * Note that it is OK to have a group of rows with no non-zero values. */
223938d9b04SRichard Tran Mills   ngroup = 0;
224938d9b04SRichard Tran Mills   istart = 0;
225938d9b04SRichard Tran Mills   for (i=0; i<=maxnz; i++) {
226938d9b04SRichard Tran Mills     if (rows_in_bucket[i] > 0) {
227938d9b04SRichard Tran Mills       aijperm->nzgroup[ngroup] = i;
228938d9b04SRichard Tran Mills       aijperm->xgroup[ngroup]  = istart;
229938d9b04SRichard Tran Mills       ngroup++;
230938d9b04SRichard Tran Mills       istart += rows_in_bucket[i];
231938d9b04SRichard Tran Mills     }
232938d9b04SRichard Tran Mills   }
233938d9b04SRichard Tran Mills 
234938d9b04SRichard Tran Mills   aijperm->xgroup[ngroup] = istart;
235938d9b04SRichard Tran Mills   aijperm->ngroup         = ngroup;
236938d9b04SRichard Tran Mills 
237938d9b04SRichard Tran Mills   /* Now fill in the permutation vector iperm. */
238938d9b04SRichard Tran Mills   ipnz[0] = 0;
239938d9b04SRichard Tran Mills   for (i=0; i<maxnz; i++) {
240938d9b04SRichard Tran Mills     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
241938d9b04SRichard Tran Mills   }
242938d9b04SRichard Tran Mills 
243938d9b04SRichard Tran Mills   for (i=0; i<m; i++) {
244938d9b04SRichard Tran Mills     nz                   = nz_in_row[i];
245938d9b04SRichard Tran Mills     ipos                 = ipnz[nz];
246938d9b04SRichard Tran Mills     aijperm->iperm[ipos] = i;
247938d9b04SRichard Tran Mills     ipnz[nz]++;
248938d9b04SRichard Tran Mills   }
249938d9b04SRichard Tran Mills 
250938d9b04SRichard Tran Mills   /* Clean up temporary work arrays. */
2519566063dSJacob Faibussowitsch   PetscCall(PetscFree(rows_in_bucket));
2529566063dSJacob Faibussowitsch   PetscCall(PetscFree(ipnz));
2539566063dSJacob Faibussowitsch   PetscCall(PetscFree(nz_in_row));
254938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
255938d9b04SRichard Tran Mills }
256938d9b04SRichard Tran Mills 
257938d9b04SRichard Tran Mills PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
258938d9b04SRichard Tran Mills {
259938d9b04SRichard Tran Mills   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
260938d9b04SRichard Tran Mills 
261938d9b04SRichard Tran Mills   PetscFunctionBegin;
262938d9b04SRichard Tran Mills   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
263938d9b04SRichard Tran Mills 
264938d9b04SRichard Tran Mills   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
265938d9b04SRichard Tran Mills    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
266938d9b04SRichard Tran Mills    * I'm not sure if this is the best way to do this, but it avoids
267938d9b04SRichard Tran Mills    * a lot of code duplication.
268938d9b04SRichard Tran Mills    * I also note that currently MATSEQAIJPERM doesn't know anything about
269938d9b04SRichard Tran Mills    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
270938d9b04SRichard Tran Mills    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
271938d9b04SRichard Tran Mills    * this, this may break things.  (Don't know... haven't looked at it.) */
272938d9b04SRichard Tran Mills   a->inode.use = PETSC_FALSE;
2739566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
274938d9b04SRichard Tran Mills 
275938d9b04SRichard Tran Mills   /* Now calculate the permutation and grouping information. */
2769566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJPERM_create_perm(A));
277938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
278938d9b04SRichard Tran Mills }
279938d9b04SRichard Tran Mills 
280938d9b04SRichard Tran Mills PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
281938d9b04SRichard Tran Mills {
282938d9b04SRichard Tran Mills   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
283938d9b04SRichard Tran Mills   const PetscScalar *x;
284938d9b04SRichard Tran Mills   PetscScalar       *y;
285938d9b04SRichard Tran Mills   const MatScalar   *aa;
286938d9b04SRichard Tran Mills   const PetscInt    *aj,*ai;
287938d9b04SRichard Tran Mills #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
288938d9b04SRichard Tran Mills   PetscInt          i,j;
289938d9b04SRichard Tran Mills #endif
29080423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
291f67b6f2eSRichard Tran Mills   __m512d           vec_x,vec_y,vec_vals;
292f67b6f2eSRichard Tran Mills   __m256i           vec_idx,vec_ipos,vec_j;
293f67b6f2eSRichard Tran Mills   __mmask8           mask;
294f67b6f2eSRichard Tran Mills #endif
295938d9b04SRichard Tran Mills 
296938d9b04SRichard Tran Mills   /* Variables that don't appear in MatMult_SeqAIJ. */
297938d9b04SRichard Tran Mills   Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
298938d9b04SRichard Tran Mills   PetscInt          *iperm;  /* Points to the permutation vector. */
299938d9b04SRichard Tran Mills   PetscInt          *xgroup;
300938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
301938d9b04SRichard Tran Mills    * begin and end in iperm. */
302938d9b04SRichard Tran Mills   PetscInt          *nzgroup;
303938d9b04SRichard Tran Mills   PetscInt          ngroup;
304938d9b04SRichard Tran Mills   PetscInt          igroup;
305938d9b04SRichard Tran Mills   PetscInt          jstart,jend;
306938d9b04SRichard Tran Mills   /* jstart is used in loops to denote the position in iperm where a
307938d9b04SRichard Tran Mills    * group starts; jend denotes the position where it ends.
308938d9b04SRichard Tran Mills    * (jend + 1 is where the next group starts.) */
309938d9b04SRichard Tran Mills   PetscInt          iold,nz;
310938d9b04SRichard Tran Mills   PetscInt          istart,iend,isize;
311938d9b04SRichard Tran Mills   PetscInt          ipos;
312938d9b04SRichard Tran Mills   PetscScalar       yp[NDIM];
313938d9b04SRichard Tran Mills   PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
314938d9b04SRichard Tran Mills 
315938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
316938d9b04SRichard Tran Mills #pragma disjoint(*x,*y,*aa)
317938d9b04SRichard Tran Mills #endif
318938d9b04SRichard Tran Mills 
319938d9b04SRichard Tran Mills   PetscFunctionBegin;
3209566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
3219566063dSJacob Faibussowitsch   PetscCall(VecGetArray(yy,&y));
322938d9b04SRichard Tran Mills   aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
323938d9b04SRichard Tran Mills   aa   = a->a; /* Nonzero elements stored row-by-row. */
324938d9b04SRichard Tran Mills   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
325938d9b04SRichard Tran Mills 
326938d9b04SRichard Tran Mills   /* Get the info we need about the permutations and groupings. */
327938d9b04SRichard Tran Mills   iperm   = aijperm->iperm;
328938d9b04SRichard Tran Mills   ngroup  = aijperm->ngroup;
329938d9b04SRichard Tran Mills   xgroup  = aijperm->xgroup;
330938d9b04SRichard Tran Mills   nzgroup = aijperm->nzgroup;
331938d9b04SRichard Tran Mills 
332938d9b04SRichard Tran Mills #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
333938d9b04SRichard Tran Mills   fortranmultaijperm_(&m,x,ii,aj,aa,y);
334938d9b04SRichard Tran Mills #else
335938d9b04SRichard Tran Mills 
336938d9b04SRichard Tran Mills   for (igroup=0; igroup<ngroup; igroup++) {
337938d9b04SRichard Tran Mills     jstart = xgroup[igroup];
338938d9b04SRichard Tran Mills     jend   = xgroup[igroup+1] - 1;
339938d9b04SRichard Tran Mills     nz     = nzgroup[igroup];
340938d9b04SRichard Tran Mills 
341938d9b04SRichard Tran Mills     /* Handle the special cases where the number of nonzeros per row
342938d9b04SRichard Tran Mills      * in the group is either 0 or 1. */
343938d9b04SRichard Tran Mills     if (nz == 0) {
344938d9b04SRichard Tran Mills       for (i=jstart; i<=jend; i++) {
345938d9b04SRichard Tran Mills         y[iperm[i]] = 0.0;
346938d9b04SRichard Tran Mills       }
347938d9b04SRichard Tran Mills     } else if (nz == 1) {
348938d9b04SRichard Tran Mills       for (i=jstart; i<=jend; i++) {
349938d9b04SRichard Tran Mills         iold    = iperm[i];
350938d9b04SRichard Tran Mills         ipos    = ai[iold];
351938d9b04SRichard Tran Mills         y[iold] = aa[ipos] * x[aj[ipos]];
352938d9b04SRichard Tran Mills       }
353938d9b04SRichard Tran Mills     } else {
354938d9b04SRichard Tran Mills 
355938d9b04SRichard Tran Mills       /* We work our way through the current group in chunks of NDIM rows
356938d9b04SRichard Tran Mills        * at a time. */
357938d9b04SRichard Tran Mills 
358938d9b04SRichard Tran Mills       for (istart=jstart; istart<=jend; istart+=NDIM) {
359938d9b04SRichard Tran Mills         /* Figure out where the chunk of 'isize' rows ends in iperm.
360938d9b04SRichard Tran Mills          * 'isize may of course be less than NDIM for the last chunk. */
361938d9b04SRichard Tran Mills         iend = istart + (NDIM - 1);
362938d9b04SRichard Tran Mills 
363938d9b04SRichard Tran Mills         if (iend > jend) iend = jend;
364938d9b04SRichard Tran Mills 
365938d9b04SRichard Tran Mills         isize = iend - istart + 1;
366938d9b04SRichard Tran Mills 
367938d9b04SRichard Tran Mills         /* Initialize the yp[] array that will be used to hold part of
368938d9b04SRichard Tran Mills          * the permuted results vector, and figure out where in aa each
369938d9b04SRichard Tran Mills          * row of the chunk will begin. */
370938d9b04SRichard Tran Mills         for (i=0; i<isize; i++) {
371938d9b04SRichard Tran Mills           iold = iperm[istart + i];
372938d9b04SRichard Tran Mills           /* iold is a row number from the matrix A *before* reordering. */
373938d9b04SRichard Tran Mills           ip[i] = ai[iold];
374938d9b04SRichard Tran Mills           /* ip[i] tells us where the ith row of the chunk begins in aa. */
375938d9b04SRichard Tran Mills           yp[i] = (PetscScalar) 0.0;
376938d9b04SRichard Tran Mills         }
377938d9b04SRichard Tran Mills 
378938d9b04SRichard Tran Mills         /* If the number of zeros per row exceeds the number of rows in
379938d9b04SRichard Tran Mills          * the chunk, we should vectorize along nz, that is, perform the
380938d9b04SRichard Tran Mills          * mat-vec one row at a time as in the usual CSR case. */
381938d9b04SRichard Tran Mills         if (nz > isize) {
382938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
383938d9b04SRichard Tran Mills #pragma _CRI preferstream
384938d9b04SRichard Tran Mills #endif
385938d9b04SRichard Tran Mills           for (i=0; i<isize; i++) {
386938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
387938d9b04SRichard Tran Mills #pragma _CRI prefervector
388938d9b04SRichard Tran Mills #endif
389f67b6f2eSRichard Tran Mills 
39080423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
391f67b6f2eSRichard Tran Mills             vec_y = _mm512_setzero_pd();
392f67b6f2eSRichard Tran Mills             ipos = ip[i];
393f67b6f2eSRichard Tran Mills             for (j=0; j<(nz>>3); j++) {
394f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
395f67b6f2eSRichard Tran Mills               vec_vals = _mm512_loadu_pd(&aa[ipos]);
396f67b6f2eSRichard Tran Mills               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
397f67b6f2eSRichard Tran Mills               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
398f67b6f2eSRichard Tran Mills               ipos += 8;
399f67b6f2eSRichard Tran Mills             }
400f67b6f2eSRichard Tran Mills             if ((nz&0x07)>2) {
401f67b6f2eSRichard Tran Mills               mask     = (__mmask8)(0xff >> (8-(nz&0x07)));
402f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
403f67b6f2eSRichard Tran Mills               vec_vals = _mm512_loadu_pd(&aa[ipos]);
404f67b6f2eSRichard Tran Mills               vec_x    = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
405f67b6f2eSRichard Tran Mills               vec_y    = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
406f67b6f2eSRichard Tran Mills             } else if ((nz&0x07)==2) {
407f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos]*x[aj[ipos]];
408f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos+1]*x[aj[ipos+1]];
409f67b6f2eSRichard Tran Mills             } else if ((nz&0x07)==1) {
410f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos]*x[aj[ipos]];
411f67b6f2eSRichard Tran Mills             }
412f67b6f2eSRichard Tran Mills             yp[i] += _mm512_reduce_add_pd(vec_y);
413f67b6f2eSRichard Tran Mills #else
414938d9b04SRichard Tran Mills             for (j=0; j<nz; j++) {
415938d9b04SRichard Tran Mills               ipos   = ip[i] + j;
416938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
417938d9b04SRichard Tran Mills             }
418f67b6f2eSRichard Tran Mills #endif
419938d9b04SRichard Tran Mills           }
420938d9b04SRichard Tran Mills         } else {
421938d9b04SRichard Tran Mills           /* Otherwise, there are enough rows in the chunk to make it
422938d9b04SRichard Tran Mills            * worthwhile to vectorize across the rows, that is, to do the
423938d9b04SRichard Tran Mills            * matvec by operating with "columns" of the chunk. */
424938d9b04SRichard Tran Mills           for (j=0; j<nz; j++) {
42580423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
426f67b6f2eSRichard Tran Mills             vec_j = _mm256_set1_epi32(j);
427f67b6f2eSRichard Tran Mills             for (i=0; i<((isize>>3)<<3); i+=8) {
428f67b6f2eSRichard Tran Mills               vec_y    = _mm512_loadu_pd(&yp[i]);
429f67b6f2eSRichard Tran Mills               vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
430f67b6f2eSRichard Tran Mills               vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
431f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
432f67b6f2eSRichard Tran Mills               vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
433f67b6f2eSRichard Tran Mills               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
434f67b6f2eSRichard Tran Mills               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
435f67b6f2eSRichard Tran Mills               _mm512_storeu_pd(&yp[i],vec_y);
436f67b6f2eSRichard Tran Mills             }
437f67b6f2eSRichard Tran Mills             for (i=isize-(isize&0x07); i<isize; i++) {
438f67b6f2eSRichard Tran Mills               ipos = ip[i]+j;
439f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos]*x[aj[ipos]];
440f67b6f2eSRichard Tran Mills             }
441f67b6f2eSRichard Tran Mills #else
442938d9b04SRichard Tran Mills             for (i=0; i<isize; i++) {
443938d9b04SRichard Tran Mills               ipos   = ip[i] + j;
444938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
445938d9b04SRichard Tran Mills             }
446f67b6f2eSRichard Tran Mills #endif
447938d9b04SRichard Tran Mills           }
448938d9b04SRichard Tran Mills         }
449938d9b04SRichard Tran Mills 
450938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
451938d9b04SRichard Tran Mills #pragma _CRI ivdep
452938d9b04SRichard Tran Mills #endif
453938d9b04SRichard Tran Mills         /* Put results from yp[] into non-permuted result vector y. */
454938d9b04SRichard Tran Mills         for (i=0; i<isize; i++) {
455938d9b04SRichard Tran Mills           y[iperm[istart+i]] = yp[i];
456938d9b04SRichard Tran Mills         }
457938d9b04SRichard Tran Mills       } /* End processing chunk of isize rows of a group. */
458938d9b04SRichard Tran Mills     } /* End handling matvec for chunk with nz > 1. */
459938d9b04SRichard Tran Mills   } /* End loop over igroup. */
460938d9b04SRichard Tran Mills #endif
4619566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0)));
4629566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
4639566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(yy,&y));
464938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
465938d9b04SRichard Tran Mills }
466938d9b04SRichard Tran Mills 
467938d9b04SRichard Tran Mills /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
468938d9b04SRichard Tran Mills  * Note that the names I used to designate the vectors differs from that
469938d9b04SRichard Tran Mills  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
470938d9b04SRichard Tran Mills  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
471938d9b04SRichard Tran Mills /*
472938d9b04SRichard Tran Mills     I hate having virtually identical code for the mult and the multadd!!!
473938d9b04SRichard Tran Mills */
474938d9b04SRichard Tran Mills PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
475938d9b04SRichard Tran Mills {
476938d9b04SRichard Tran Mills   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
477938d9b04SRichard Tran Mills   const PetscScalar *x;
478938d9b04SRichard Tran Mills   PetscScalar       *y,*w;
479938d9b04SRichard Tran Mills   const MatScalar   *aa;
480938d9b04SRichard Tran Mills   const PetscInt    *aj,*ai;
481938d9b04SRichard Tran Mills #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
482938d9b04SRichard Tran Mills   PetscInt i,j;
483938d9b04SRichard Tran Mills #endif
484938d9b04SRichard Tran Mills 
485938d9b04SRichard Tran Mills   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
486938d9b04SRichard Tran Mills   Mat_SeqAIJPERM * aijperm;
487938d9b04SRichard Tran Mills   PetscInt       *iperm;    /* Points to the permutation vector. */
488938d9b04SRichard Tran Mills   PetscInt       *xgroup;
489938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
490938d9b04SRichard Tran Mills    * begin and end in iperm. */
491938d9b04SRichard Tran Mills   PetscInt *nzgroup;
492938d9b04SRichard Tran Mills   PetscInt ngroup;
493938d9b04SRichard Tran Mills   PetscInt igroup;
494938d9b04SRichard Tran Mills   PetscInt jstart,jend;
495938d9b04SRichard Tran Mills   /* jstart is used in loops to denote the position in iperm where a
496938d9b04SRichard Tran Mills    * group starts; jend denotes the position where it ends.
497938d9b04SRichard Tran Mills    * (jend + 1 is where the next group starts.) */
498938d9b04SRichard Tran Mills   PetscInt    iold,nz;
499938d9b04SRichard Tran Mills   PetscInt    istart,iend,isize;
500938d9b04SRichard Tran Mills   PetscInt    ipos;
501938d9b04SRichard Tran Mills   PetscScalar yp[NDIM];
502938d9b04SRichard Tran Mills   PetscInt    ip[NDIM];
503938d9b04SRichard Tran Mills   /* yp[] and ip[] are treated as vector "registers" for performing
504938d9b04SRichard Tran Mills    * the mat-vec. */
505938d9b04SRichard Tran Mills 
506938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
507938d9b04SRichard Tran Mills #pragma disjoint(*x,*y,*aa)
508938d9b04SRichard Tran Mills #endif
509938d9b04SRichard Tran Mills 
510938d9b04SRichard Tran Mills   PetscFunctionBegin;
5119566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx,&x));
5129566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy,ww,&y,&w));
513938d9b04SRichard Tran Mills 
514938d9b04SRichard Tran Mills   aj = a->j;   /* aj[k] gives column index for element aa[k]. */
515938d9b04SRichard Tran Mills   aa = a->a;   /* Nonzero elements stored row-by-row. */
516938d9b04SRichard Tran Mills   ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */
517938d9b04SRichard Tran Mills 
518938d9b04SRichard Tran Mills   /* Get the info we need about the permutations and groupings. */
519938d9b04SRichard Tran Mills   aijperm = (Mat_SeqAIJPERM*) A->spptr;
520938d9b04SRichard Tran Mills   iperm   = aijperm->iperm;
521938d9b04SRichard Tran Mills   ngroup  = aijperm->ngroup;
522938d9b04SRichard Tran Mills   xgroup  = aijperm->xgroup;
523938d9b04SRichard Tran Mills   nzgroup = aijperm->nzgroup;
524938d9b04SRichard Tran Mills 
525938d9b04SRichard Tran Mills #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
526938d9b04SRichard Tran Mills   fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
527938d9b04SRichard Tran Mills #else
528938d9b04SRichard Tran Mills 
529938d9b04SRichard Tran Mills   for (igroup=0; igroup<ngroup; igroup++) {
530938d9b04SRichard Tran Mills     jstart = xgroup[igroup];
531938d9b04SRichard Tran Mills     jend   = xgroup[igroup+1] - 1;
532938d9b04SRichard Tran Mills 
533938d9b04SRichard Tran Mills     nz = nzgroup[igroup];
534938d9b04SRichard Tran Mills 
535938d9b04SRichard Tran Mills     /* Handle the special cases where the number of nonzeros per row
536938d9b04SRichard Tran Mills      * in the group is either 0 or 1. */
537938d9b04SRichard Tran Mills     if (nz == 0) {
538938d9b04SRichard Tran Mills       for (i=jstart; i<=jend; i++) {
539938d9b04SRichard Tran Mills         iold    = iperm[i];
540938d9b04SRichard Tran Mills         y[iold] = w[iold];
541938d9b04SRichard Tran Mills       }
542938d9b04SRichard Tran Mills     }
543938d9b04SRichard Tran Mills     else if (nz == 1) {
544938d9b04SRichard Tran Mills       for (i=jstart; i<=jend; i++) {
545938d9b04SRichard Tran Mills         iold    = iperm[i];
546938d9b04SRichard Tran Mills         ipos    = ai[iold];
547938d9b04SRichard Tran Mills         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
548938d9b04SRichard Tran Mills       }
549938d9b04SRichard Tran Mills     }
550938d9b04SRichard Tran Mills     /* For the general case: */
551938d9b04SRichard Tran Mills     else {
552938d9b04SRichard Tran Mills 
553938d9b04SRichard Tran Mills       /* We work our way through the current group in chunks of NDIM rows
554938d9b04SRichard Tran Mills        * at a time. */
555938d9b04SRichard Tran Mills 
556938d9b04SRichard Tran Mills       for (istart=jstart; istart<=jend; istart+=NDIM) {
557938d9b04SRichard Tran Mills         /* Figure out where the chunk of 'isize' rows ends in iperm.
558938d9b04SRichard Tran Mills          * 'isize may of course be less than NDIM for the last chunk. */
559938d9b04SRichard Tran Mills         iend = istart + (NDIM - 1);
560938d9b04SRichard Tran Mills         if (iend > jend) iend = jend;
561938d9b04SRichard Tran Mills         isize = iend - istart + 1;
562938d9b04SRichard Tran Mills 
563938d9b04SRichard Tran Mills         /* Initialize the yp[] array that will be used to hold part of
564938d9b04SRichard Tran Mills          * the permuted results vector, and figure out where in aa each
565938d9b04SRichard Tran Mills          * row of the chunk will begin. */
566938d9b04SRichard Tran Mills         for (i=0; i<isize; i++) {
567938d9b04SRichard Tran Mills           iold = iperm[istart + i];
568938d9b04SRichard Tran Mills           /* iold is a row number from the matrix A *before* reordering. */
569938d9b04SRichard Tran Mills           ip[i] = ai[iold];
570938d9b04SRichard Tran Mills           /* ip[i] tells us where the ith row of the chunk begins in aa. */
571938d9b04SRichard Tran Mills           yp[i] = w[iold];
572938d9b04SRichard Tran Mills         }
573938d9b04SRichard Tran Mills 
574938d9b04SRichard Tran Mills         /* If the number of zeros per row exceeds the number of rows in
575938d9b04SRichard Tran Mills          * the chunk, we should vectorize along nz, that is, perform the
576938d9b04SRichard Tran Mills          * mat-vec one row at a time as in the usual CSR case. */
577938d9b04SRichard Tran Mills         if (nz > isize) {
578938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
579938d9b04SRichard Tran Mills #pragma _CRI preferstream
580938d9b04SRichard Tran Mills #endif
581938d9b04SRichard Tran Mills           for (i=0; i<isize; i++) {
582938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
583938d9b04SRichard Tran Mills #pragma _CRI prefervector
584938d9b04SRichard Tran Mills #endif
585938d9b04SRichard Tran Mills             for (j=0; j<nz; j++) {
586938d9b04SRichard Tran Mills               ipos   = ip[i] + j;
587938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
588938d9b04SRichard Tran Mills             }
589938d9b04SRichard Tran Mills           }
590938d9b04SRichard Tran Mills         }
591938d9b04SRichard Tran Mills         /* Otherwise, there are enough rows in the chunk to make it
592938d9b04SRichard Tran Mills          * worthwhile to vectorize across the rows, that is, to do the
593938d9b04SRichard Tran Mills          * matvec by operating with "columns" of the chunk. */
594938d9b04SRichard Tran Mills         else {
595938d9b04SRichard Tran Mills           for (j=0; j<nz; j++) {
596938d9b04SRichard Tran Mills             for (i=0; i<isize; i++) {
597938d9b04SRichard Tran Mills               ipos   = ip[i] + j;
598938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
599938d9b04SRichard Tran Mills             }
600938d9b04SRichard Tran Mills           }
601938d9b04SRichard Tran Mills         }
602938d9b04SRichard Tran Mills 
603938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
604938d9b04SRichard Tran Mills #pragma _CRI ivdep
605938d9b04SRichard Tran Mills #endif
606938d9b04SRichard Tran Mills         /* Put results from yp[] into non-permuted result vector y. */
607938d9b04SRichard Tran Mills         for (i=0; i<isize; i++) {
608938d9b04SRichard Tran Mills           y[iperm[istart+i]] = yp[i];
609938d9b04SRichard Tran Mills         }
610938d9b04SRichard Tran Mills       } /* End processing chunk of isize rows of a group. */
611938d9b04SRichard Tran Mills 
612938d9b04SRichard Tran Mills     } /* End handling matvec for chunk with nz > 1. */
613938d9b04SRichard Tran Mills   } /* End loop over igroup. */
614938d9b04SRichard Tran Mills 
615938d9b04SRichard Tran Mills #endif
6169566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*a->nz));
6179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx,&x));
6189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy,ww,&y,&w));
619938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
620938d9b04SRichard Tran Mills }
621938d9b04SRichard Tran Mills 
622938d9b04SRichard Tran Mills /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
623938d9b04SRichard Tran Mills  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
624938d9b04SRichard Tran Mills  * routine, but can also be used to convert an assembled SeqAIJ matrix
625938d9b04SRichard Tran Mills  * into a SeqAIJPERM one. */
626938d9b04SRichard Tran Mills PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
627938d9b04SRichard Tran Mills {
628938d9b04SRichard Tran Mills   Mat            B = *newmat;
629938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm;
630938d9b04SRichard Tran Mills   PetscBool      sametype;
631938d9b04SRichard Tran Mills 
632938d9b04SRichard Tran Mills   PetscFunctionBegin;
633938d9b04SRichard Tran Mills   if (reuse == MAT_INITIAL_MATRIX) {
6349566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,&B));
635938d9b04SRichard Tran Mills   }
6369566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,type,&sametype));
637938d9b04SRichard Tran Mills   if (sametype) PetscFunctionReturn(0);
638938d9b04SRichard Tran Mills 
6399566063dSJacob Faibussowitsch   PetscCall(PetscNewLog(B,&aijperm));
640938d9b04SRichard Tran Mills   B->spptr = (void*) aijperm;
641938d9b04SRichard Tran Mills 
642938d9b04SRichard Tran Mills   /* Set function pointers for methods that we inherit from AIJ but override. */
643938d9b04SRichard Tran Mills   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
644938d9b04SRichard Tran Mills   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
645938d9b04SRichard Tran Mills   B->ops->destroy     = MatDestroy_SeqAIJPERM;
646938d9b04SRichard Tran Mills   B->ops->mult        = MatMult_SeqAIJPERM;
647938d9b04SRichard Tran Mills   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
648938d9b04SRichard Tran Mills 
649938d9b04SRichard Tran Mills   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
650938d9b04SRichard Tran Mills   /* If A has already been assembled, compute the permutation. */
651938d9b04SRichard Tran Mills   if (A->assembled) {
6529566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJPERM_create_perm(B));
653938d9b04SRichard Tran Mills   }
654938d9b04SRichard Tran Mills 
6559566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ));
656938d9b04SRichard Tran Mills 
6579566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM));
658938d9b04SRichard Tran Mills   *newmat = B;
659938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
660938d9b04SRichard Tran Mills }
661938d9b04SRichard Tran Mills 
662938d9b04SRichard Tran Mills /*@C
663938d9b04SRichard Tran Mills    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
664938d9b04SRichard Tran Mills    This type inherits from AIJ, but calculates some additional permutation
665938d9b04SRichard Tran Mills    information that is used to allow better vectorization of some
666938d9b04SRichard Tran Mills    operations.  At the cost of increased storage, the AIJ formatted
667938d9b04SRichard Tran Mills    matrix can be copied to a format in which pieces of the matrix are
668938d9b04SRichard Tran Mills    stored in ELLPACK format, allowing the vectorized matrix multiply
669938d9b04SRichard Tran Mills    routine to use stride-1 memory accesses.  As with the AIJ type, it is
670938d9b04SRichard Tran Mills    important to preallocate matrix storage in order to get good assembly
671938d9b04SRichard Tran Mills    performance.
672938d9b04SRichard Tran Mills 
673d083f849SBarry Smith    Collective
674938d9b04SRichard Tran Mills 
675938d9b04SRichard Tran Mills    Input Parameters:
676938d9b04SRichard Tran Mills +  comm - MPI communicator, set to PETSC_COMM_SELF
677938d9b04SRichard Tran Mills .  m - number of rows
678938d9b04SRichard Tran Mills .  n - number of columns
679938d9b04SRichard Tran Mills .  nz - number of nonzeros per row (same for all rows)
680938d9b04SRichard Tran Mills -  nnz - array containing the number of nonzeros in the various rows
681938d9b04SRichard Tran Mills          (possibly different for each row) or NULL
682938d9b04SRichard Tran Mills 
683938d9b04SRichard Tran Mills    Output Parameter:
684938d9b04SRichard Tran Mills .  A - the matrix
685938d9b04SRichard Tran Mills 
686938d9b04SRichard Tran Mills    Notes:
687938d9b04SRichard Tran Mills    If nnz is given then nz is ignored
688938d9b04SRichard Tran Mills 
689938d9b04SRichard Tran Mills    Level: intermediate
690938d9b04SRichard Tran Mills 
691938d9b04SRichard Tran Mills .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
692938d9b04SRichard Tran Mills @*/
693938d9b04SRichard Tran Mills PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
694938d9b04SRichard Tran Mills {
695938d9b04SRichard Tran Mills   PetscFunctionBegin;
6969566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm,A));
6979566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A,m,n,m,n));
6989566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A,MATSEQAIJPERM));
6999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz));
700938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
701938d9b04SRichard Tran Mills }
702938d9b04SRichard Tran Mills 
703938d9b04SRichard Tran Mills PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
704938d9b04SRichard Tran Mills {
705938d9b04SRichard Tran Mills   PetscFunctionBegin;
7069566063dSJacob Faibussowitsch   PetscCall(MatSetType(A,MATSEQAIJ));
7079566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A));
708938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
709938d9b04SRichard Tran Mills }
710