xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision 1cc06b555e92f8ec64db10330b8bbd830e5bc876)
1938d9b04SRichard Tran Mills 
2938d9b04SRichard Tran Mills /*
3938d9b04SRichard Tran Mills   Defines basic operations for the MATSEQAIJPERM matrix class.
4938d9b04SRichard Tran Mills   This class is derived from the MATSEQAIJ class and retains the
5938d9b04SRichard Tran Mills   compressed row storage (aka Yale sparse matrix format) but augments
6938d9b04SRichard Tran Mills   it with some permutation information that enables some operations
7938d9b04SRichard Tran Mills   to be more vectorizable.  A physically rearranged copy of the matrix
8938d9b04SRichard Tran Mills   may be stored if the user desires.
9938d9b04SRichard Tran Mills 
10938d9b04SRichard Tran Mills   Eventually a variety of permutations may be supported.
11938d9b04SRichard Tran Mills */
12938d9b04SRichard Tran Mills 
13938d9b04SRichard Tran Mills #include <../src/mat/impls/aij/seq/aij.h>
14938d9b04SRichard Tran Mills 
1580423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16f67b6f2eSRichard Tran Mills   #include <immintrin.h>
17f67b6f2eSRichard Tran Mills 
18f67b6f2eSRichard Tran Mills   #if !defined(_MM_SCALE_8)
19f67b6f2eSRichard Tran Mills     #define _MM_SCALE_8 8
20f67b6f2eSRichard Tran Mills   #endif
21f67b6f2eSRichard Tran Mills   #if !defined(_MM_SCALE_4)
22f67b6f2eSRichard Tran Mills     #define _MM_SCALE_4 4
23f67b6f2eSRichard Tran Mills   #endif
24f67b6f2eSRichard Tran Mills #endif
25f67b6f2eSRichard Tran Mills 
26938d9b04SRichard Tran Mills #define NDIM 512
27938d9b04SRichard Tran Mills /* NDIM specifies how many rows at a time we should work with when
28938d9b04SRichard Tran Mills  * performing the vectorized mat-vec.  This depends on various factors
29938d9b04SRichard Tran Mills  * such as vector register length, etc., and I really need to add a
30938d9b04SRichard Tran Mills  * way for the user (or the library) to tune this.  I'm setting it to
31938d9b04SRichard Tran Mills  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32938d9b04SRichard Tran Mills  * routines. */
33938d9b04SRichard Tran Mills 
34938d9b04SRichard Tran Mills typedef struct {
35938d9b04SRichard Tran Mills   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36938d9b04SRichard Tran Mills 
37938d9b04SRichard Tran Mills   PetscInt  ngroup;
38938d9b04SRichard Tran Mills   PetscInt *xgroup;
39938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
40938d9b04SRichard Tran Mills    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41938d9b04SRichard Tran Mills    * where the ith group begins. */
42938d9b04SRichard Tran Mills 
43938d9b04SRichard Tran Mills   PetscInt *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
44938d9b04SRichard Tran Mills   PetscInt *iperm;   /* The permutation vector. */
45938d9b04SRichard Tran Mills 
46938d9b04SRichard Tran Mills   /* Some of this stuff is for Ed's recursive triangular solve.
47938d9b04SRichard Tran Mills    * I'm not sure what I need yet. */
48938d9b04SRichard Tran Mills   PetscInt   blocksize;
49938d9b04SRichard Tran Mills   PetscInt   nstep;
50938d9b04SRichard Tran Mills   PetscInt  *jstart_list;
51938d9b04SRichard Tran Mills   PetscInt  *jend_list;
52938d9b04SRichard Tran Mills   PetscInt  *action_list;
53938d9b04SRichard Tran Mills   PetscInt  *ngroup_list;
54938d9b04SRichard Tran Mills   PetscInt **ipointer_list;
55938d9b04SRichard Tran Mills   PetscInt **xgroup_list;
56938d9b04SRichard Tran Mills   PetscInt **nzgroup_list;
57938d9b04SRichard Tran Mills   PetscInt **iperm_list;
58938d9b04SRichard Tran Mills } Mat_SeqAIJPERM;
59938d9b04SRichard Tran Mills 
60d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat)
61d71ae5a4SJacob Faibussowitsch {
62938d9b04SRichard Tran Mills   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63938d9b04SRichard Tran Mills   /* so we will ignore 'MatType type'. */
64938d9b04SRichard Tran Mills   Mat             B       = *newmat;
65938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
66938d9b04SRichard Tran Mills 
67938d9b04SRichard Tran Mills   PetscFunctionBegin;
68938d9b04SRichard Tran Mills   if (reuse == MAT_INITIAL_MATRIX) {
699566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
70938d9b04SRichard Tran Mills     aijperm = (Mat_SeqAIJPERM *)B->spptr;
71938d9b04SRichard Tran Mills   }
72938d9b04SRichard Tran Mills 
73938d9b04SRichard Tran Mills   /* Reset the original function pointers. */
74938d9b04SRichard Tran Mills   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
75938d9b04SRichard Tran Mills   B->ops->destroy     = MatDestroy_SeqAIJ;
76938d9b04SRichard Tran Mills   B->ops->duplicate   = MatDuplicate_SeqAIJ;
77938d9b04SRichard Tran Mills   B->ops->mult        = MatMult_SeqAIJ;
78938d9b04SRichard Tran Mills   B->ops->multadd     = MatMultAdd_SeqAIJ;
79938d9b04SRichard Tran Mills 
809566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));
81938d9b04SRichard Tran Mills 
82938d9b04SRichard Tran Mills   /* Free everything in the Mat_SeqAIJPERM data structure.*/
839566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->xgroup));
849566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->nzgroup));
859566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->iperm));
869566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->spptr));
87938d9b04SRichard Tran Mills 
88938d9b04SRichard Tran Mills   /* Change the type of B to MATSEQAIJ. */
899566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));
90938d9b04SRichard Tran Mills 
91938d9b04SRichard Tran Mills   *newmat = B;
923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
93938d9b04SRichard Tran Mills }
94938d9b04SRichard Tran Mills 
95d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
96d71ae5a4SJacob Faibussowitsch {
97938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
98938d9b04SRichard Tran Mills 
99938d9b04SRichard Tran Mills   PetscFunctionBegin;
100938d9b04SRichard Tran Mills   if (aijperm) {
101938d9b04SRichard Tran Mills     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
1029566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->xgroup));
1039566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->nzgroup));
1049566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->iperm));
1059566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->spptr));
106938d9b04SRichard Tran Mills   }
107938d9b04SRichard Tran Mills   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
108938d9b04SRichard Tran Mills    * to destroy everything that remains. */
1099566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
110938d9b04SRichard Tran Mills   /* Note that I don't call MatSetType().  I believe this is because that
111938d9b04SRichard Tran Mills    * is only to be called when *building* a matrix.  I could be wrong, but
112938d9b04SRichard Tran Mills    * that is how things work for the SuperLU matrix class. */
1132e956fe4SStefano Zampini   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
1149566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
1153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
116938d9b04SRichard Tran Mills }
117938d9b04SRichard Tran Mills 
118d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
119d71ae5a4SJacob Faibussowitsch {
120938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
121938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm_dest;
122938d9b04SRichard Tran Mills   PetscBool       perm;
123938d9b04SRichard Tran Mills 
124938d9b04SRichard Tran Mills   PetscFunctionBegin;
1259566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, op, M));
1269566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
127938d9b04SRichard Tran Mills   if (perm) {
128938d9b04SRichard Tran Mills     aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
1299566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->xgroup));
1309566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->nzgroup));
1319566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->iperm));
132938d9b04SRichard Tran Mills   } else {
1334dfa11a4SJacob Faibussowitsch     PetscCall(PetscNew(&aijperm_dest));
134938d9b04SRichard Tran Mills     (*M)->spptr = (void *)aijperm_dest;
1359566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
1369566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
137938d9b04SRichard Tran Mills   }
1389566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
139938d9b04SRichard Tran Mills   /* Allocate space for, and copy the grouping and permutation info.
140938d9b04SRichard Tran Mills    * I note that when the groups are initially determined in
141938d9b04SRichard Tran Mills    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
142938d9b04SRichard Tran Mills    * necessary.  But at this point, we know how large they need to be, and
143938d9b04SRichard Tran Mills    * allocate only the necessary amount of memory.  So the duplicated matrix
144938d9b04SRichard Tran Mills    * may actually use slightly less storage than the original! */
1459566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
1469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
1479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
1489566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
1499566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
1509566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
1513ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
152938d9b04SRichard Tran Mills }
153938d9b04SRichard Tran Mills 
154d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
155d71ae5a4SJacob Faibussowitsch {
156938d9b04SRichard Tran Mills   Mat_SeqAIJ     *a       = (Mat_SeqAIJ *)(A)->data;
157938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
158938d9b04SRichard Tran Mills   PetscInt        m;     /* Number of rows in the matrix. */
159938d9b04SRichard Tran Mills   PetscInt       *ia;    /* From the CSR representation; points to the beginning  of each row. */
160938d9b04SRichard Tran Mills   PetscInt        maxnz; /* Maximum number of nonzeros in any row. */
161938d9b04SRichard Tran Mills   PetscInt       *rows_in_bucket;
162938d9b04SRichard Tran Mills   /* To construct the permutation, we sort each row into one of maxnz
163938d9b04SRichard Tran Mills    * buckets based on how many nonzeros are in the row. */
164938d9b04SRichard Tran Mills   PetscInt  nz;
165938d9b04SRichard Tran Mills   PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
166938d9b04SRichard Tran Mills   PetscInt *ipnz;
167938d9b04SRichard Tran Mills   /* When constructing the iperm permutation vector,
168938d9b04SRichard Tran Mills    * ipnz[nz] is used to point to the next place in the permutation vector
169938d9b04SRichard Tran Mills    * that a row with nz nonzero elements should be placed.*/
170938d9b04SRichard Tran Mills   PetscInt i, ngroup, istart, ipos;
171938d9b04SRichard Tran Mills 
172938d9b04SRichard Tran Mills   PetscFunctionBegin;
1733ba16761SJacob Faibussowitsch   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); /* permutation exists and matches current nonzero structure */
174938d9b04SRichard Tran Mills   aijperm->nonzerostate = A->nonzerostate;
175938d9b04SRichard Tran Mills   /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
1769566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->xgroup));
1779566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->nzgroup));
1789566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->iperm));
179938d9b04SRichard Tran Mills 
180938d9b04SRichard Tran Mills   m  = A->rmap->n;
181938d9b04SRichard Tran Mills   ia = a->i;
182938d9b04SRichard Tran Mills 
183938d9b04SRichard Tran Mills   /* Allocate the arrays that will hold the permutation vector. */
1849566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aijperm->iperm));
185938d9b04SRichard Tran Mills 
186938d9b04SRichard Tran Mills   /* Allocate some temporary work arrays that will be used in
1876aad120cSJose E. Roman    * calculating the permutation vector and groupings. */
1889566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &nz_in_row));
189938d9b04SRichard Tran Mills 
190938d9b04SRichard Tran Mills   /* Now actually figure out the permutation and grouping. */
191938d9b04SRichard Tran Mills 
192938d9b04SRichard Tran Mills   /* First pass: Determine number of nonzeros in each row, maximum
193938d9b04SRichard Tran Mills    * number of nonzeros in any row, and how many rows fall into each
194938d9b04SRichard Tran Mills    * "bucket" of rows with same number of nonzeros. */
195938d9b04SRichard Tran Mills   maxnz = 0;
196938d9b04SRichard Tran Mills   for (i = 0; i < m; i++) {
197938d9b04SRichard Tran Mills     nz_in_row[i] = ia[i + 1] - ia[i];
198938d9b04SRichard Tran Mills     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
199938d9b04SRichard Tran Mills   }
2009566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
2019566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));
202938d9b04SRichard Tran Mills 
203ad540459SPierre Jolivet   for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
204938d9b04SRichard Tran Mills   for (i = 0; i < m; i++) {
205938d9b04SRichard Tran Mills     nz = nz_in_row[i];
206938d9b04SRichard Tran Mills     rows_in_bucket[nz]++;
207938d9b04SRichard Tran Mills   }
208938d9b04SRichard Tran Mills 
209938d9b04SRichard Tran Mills   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
210938d9b04SRichard Tran Mills    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
211938d9b04SRichard Tran Mills    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
212938d9b04SRichard Tran Mills    * then xgroup[] must consist of (maxnz + 2) elements, since the last
213938d9b04SRichard Tran Mills    * element of xgroup will tell us where the (maxnz + 1)th group ends.
214938d9b04SRichard Tran Mills    * We allocate space for the maximum number of groups;
215938d9b04SRichard Tran Mills    * that is potentially a little wasteful, but not too much so.
216938d9b04SRichard Tran Mills    * Perhaps I should fix it later. */
2179566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
2189566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));
219938d9b04SRichard Tran Mills 
220938d9b04SRichard Tran Mills   /* Second pass.  Look at what is in the buckets and create the groupings.
221938d9b04SRichard Tran Mills    * Note that it is OK to have a group of rows with no non-zero values. */
222938d9b04SRichard Tran Mills   ngroup = 0;
223938d9b04SRichard Tran Mills   istart = 0;
224938d9b04SRichard Tran Mills   for (i = 0; i <= maxnz; i++) {
225938d9b04SRichard Tran Mills     if (rows_in_bucket[i] > 0) {
226938d9b04SRichard Tran Mills       aijperm->nzgroup[ngroup] = i;
227938d9b04SRichard Tran Mills       aijperm->xgroup[ngroup]  = istart;
228938d9b04SRichard Tran Mills       ngroup++;
229938d9b04SRichard Tran Mills       istart += rows_in_bucket[i];
230938d9b04SRichard Tran Mills     }
231938d9b04SRichard Tran Mills   }
232938d9b04SRichard Tran Mills 
233938d9b04SRichard Tran Mills   aijperm->xgroup[ngroup] = istart;
234938d9b04SRichard Tran Mills   aijperm->ngroup         = ngroup;
235938d9b04SRichard Tran Mills 
236938d9b04SRichard Tran Mills   /* Now fill in the permutation vector iperm. */
237938d9b04SRichard Tran Mills   ipnz[0] = 0;
238ad540459SPierre Jolivet   for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];
239938d9b04SRichard Tran Mills 
240938d9b04SRichard Tran Mills   for (i = 0; i < m; i++) {
241938d9b04SRichard Tran Mills     nz                   = nz_in_row[i];
242938d9b04SRichard Tran Mills     ipos                 = ipnz[nz];
243938d9b04SRichard Tran Mills     aijperm->iperm[ipos] = i;
244938d9b04SRichard Tran Mills     ipnz[nz]++;
245938d9b04SRichard Tran Mills   }
246938d9b04SRichard Tran Mills 
247938d9b04SRichard Tran Mills   /* Clean up temporary work arrays. */
2489566063dSJacob Faibussowitsch   PetscCall(PetscFree(rows_in_bucket));
2499566063dSJacob Faibussowitsch   PetscCall(PetscFree(ipnz));
2509566063dSJacob Faibussowitsch   PetscCall(PetscFree(nz_in_row));
2513ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
252938d9b04SRichard Tran Mills }
253938d9b04SRichard Tran Mills 
254d71ae5a4SJacob Faibussowitsch PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
255d71ae5a4SJacob Faibussowitsch {
256938d9b04SRichard Tran Mills   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
257938d9b04SRichard Tran Mills 
258938d9b04SRichard Tran Mills   PetscFunctionBegin;
2593ba16761SJacob Faibussowitsch   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
260938d9b04SRichard Tran Mills 
261938d9b04SRichard Tran Mills   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
262938d9b04SRichard Tran Mills    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
263938d9b04SRichard Tran Mills    * I'm not sure if this is the best way to do this, but it avoids
264938d9b04SRichard Tran Mills    * a lot of code duplication.
265938d9b04SRichard Tran Mills    * I also note that currently MATSEQAIJPERM doesn't know anything about
266938d9b04SRichard Tran Mills    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
267938d9b04SRichard Tran Mills    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
268938d9b04SRichard Tran Mills    * this, this may break things.  (Don't know... haven't looked at it.) */
269938d9b04SRichard Tran Mills   a->inode.use = PETSC_FALSE;
2709566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
271938d9b04SRichard Tran Mills 
272938d9b04SRichard Tran Mills   /* Now calculate the permutation and grouping information. */
2739566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJPERM_create_perm(A));
2743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
275938d9b04SRichard Tran Mills }
276938d9b04SRichard Tran Mills 
277d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy)
278d71ae5a4SJacob Faibussowitsch {
279938d9b04SRichard Tran Mills   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
280938d9b04SRichard Tran Mills   const PetscScalar *x;
281938d9b04SRichard Tran Mills   PetscScalar       *y;
282938d9b04SRichard Tran Mills   const MatScalar   *aa;
283938d9b04SRichard Tran Mills   const PetscInt    *aj, *ai;
284938d9b04SRichard Tran Mills #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
285938d9b04SRichard Tran Mills   PetscInt i, j;
286938d9b04SRichard Tran Mills #endif
28780423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
288f67b6f2eSRichard Tran Mills   __m512d  vec_x, vec_y, vec_vals;
289f67b6f2eSRichard Tran Mills   __m256i  vec_idx, vec_ipos, vec_j;
290f67b6f2eSRichard Tran Mills   __mmask8 mask;
291f67b6f2eSRichard Tran Mills #endif
292938d9b04SRichard Tran Mills 
293938d9b04SRichard Tran Mills   /* Variables that don't appear in MatMult_SeqAIJ. */
294938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
295938d9b04SRichard Tran Mills   PetscInt       *iperm; /* Points to the permutation vector. */
296938d9b04SRichard Tran Mills   PetscInt       *xgroup;
297938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
298938d9b04SRichard Tran Mills    * begin and end in iperm. */
299938d9b04SRichard Tran Mills   PetscInt *nzgroup;
300938d9b04SRichard Tran Mills   PetscInt  ngroup;
301938d9b04SRichard Tran Mills   PetscInt  igroup;
302938d9b04SRichard Tran Mills   PetscInt  jstart, jend;
303938d9b04SRichard Tran Mills   /* jstart is used in loops to denote the position in iperm where a
304938d9b04SRichard Tran Mills    * group starts; jend denotes the position where it ends.
305938d9b04SRichard Tran Mills    * (jend + 1 is where the next group starts.) */
306938d9b04SRichard Tran Mills   PetscInt    iold, nz;
307938d9b04SRichard Tran Mills   PetscInt    istart, iend, isize;
308938d9b04SRichard Tran Mills   PetscInt    ipos;
309938d9b04SRichard Tran Mills   PetscScalar yp[NDIM];
310938d9b04SRichard Tran Mills   PetscInt    ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
311938d9b04SRichard Tran Mills 
312938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
313938d9b04SRichard Tran Mills   #pragma disjoint(*x, *y, *aa)
314938d9b04SRichard Tran Mills #endif
315938d9b04SRichard Tran Mills 
316938d9b04SRichard Tran Mills   PetscFunctionBegin;
3179566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3189566063dSJacob Faibussowitsch   PetscCall(VecGetArray(yy, &y));
319938d9b04SRichard Tran Mills   aj = a->j; /* aj[k] gives column index for element aa[k]. */
320938d9b04SRichard Tran Mills   aa = a->a; /* Nonzero elements stored row-by-row. */
321938d9b04SRichard Tran Mills   ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
322938d9b04SRichard Tran Mills 
323938d9b04SRichard Tran Mills   /* Get the info we need about the permutations and groupings. */
324938d9b04SRichard Tran Mills   iperm   = aijperm->iperm;
325938d9b04SRichard Tran Mills   ngroup  = aijperm->ngroup;
326938d9b04SRichard Tran Mills   xgroup  = aijperm->xgroup;
327938d9b04SRichard Tran Mills   nzgroup = aijperm->nzgroup;
328938d9b04SRichard Tran Mills 
329938d9b04SRichard Tran Mills #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
330938d9b04SRichard Tran Mills   fortranmultaijperm_(&m, x, ii, aj, aa, y);
331938d9b04SRichard Tran Mills #else
332938d9b04SRichard Tran Mills 
333938d9b04SRichard Tran Mills   for (igroup = 0; igroup < ngroup; igroup++) {
334938d9b04SRichard Tran Mills     jstart = xgroup[igroup];
335938d9b04SRichard Tran Mills     jend   = xgroup[igroup + 1] - 1;
336938d9b04SRichard Tran Mills     nz     = nzgroup[igroup];
337938d9b04SRichard Tran Mills 
338938d9b04SRichard Tran Mills     /* Handle the special cases where the number of nonzeros per row
339938d9b04SRichard Tran Mills      * in the group is either 0 or 1. */
340938d9b04SRichard Tran Mills     if (nz == 0) {
341ad540459SPierre Jolivet       for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
342938d9b04SRichard Tran Mills     } else if (nz == 1) {
343938d9b04SRichard Tran Mills       for (i = jstart; i <= jend; i++) {
344938d9b04SRichard Tran Mills         iold    = iperm[i];
345938d9b04SRichard Tran Mills         ipos    = ai[iold];
346938d9b04SRichard Tran Mills         y[iold] = aa[ipos] * x[aj[ipos]];
347938d9b04SRichard Tran Mills       }
348938d9b04SRichard Tran Mills     } else {
349938d9b04SRichard Tran Mills       /* We work our way through the current group in chunks of NDIM rows
350938d9b04SRichard Tran Mills        * at a time. */
351938d9b04SRichard Tran Mills 
352938d9b04SRichard Tran Mills       for (istart = jstart; istart <= jend; istart += NDIM) {
353938d9b04SRichard Tran Mills         /* Figure out where the chunk of 'isize' rows ends in iperm.
354938d9b04SRichard Tran Mills          * 'isize may of course be less than NDIM for the last chunk. */
355938d9b04SRichard Tran Mills         iend = istart + (NDIM - 1);
356938d9b04SRichard Tran Mills 
357938d9b04SRichard Tran Mills         if (iend > jend) iend = jend;
358938d9b04SRichard Tran Mills 
359938d9b04SRichard Tran Mills         isize = iend - istart + 1;
360938d9b04SRichard Tran Mills 
361938d9b04SRichard Tran Mills         /* Initialize the yp[] array that will be used to hold part of
362938d9b04SRichard Tran Mills          * the permuted results vector, and figure out where in aa each
363938d9b04SRichard Tran Mills          * row of the chunk will begin. */
364938d9b04SRichard Tran Mills         for (i = 0; i < isize; i++) {
365938d9b04SRichard Tran Mills           iold = iperm[istart + i];
366938d9b04SRichard Tran Mills           /* iold is a row number from the matrix A *before* reordering. */
367938d9b04SRichard Tran Mills           ip[i] = ai[iold];
368938d9b04SRichard Tran Mills           /* ip[i] tells us where the ith row of the chunk begins in aa. */
369938d9b04SRichard Tran Mills           yp[i] = (PetscScalar)0.0;
370938d9b04SRichard Tran Mills         }
371938d9b04SRichard Tran Mills 
372938d9b04SRichard Tran Mills         /* If the number of zeros per row exceeds the number of rows in
373938d9b04SRichard Tran Mills          * the chunk, we should vectorize along nz, that is, perform the
374938d9b04SRichard Tran Mills          * mat-vec one row at a time as in the usual CSR case. */
375938d9b04SRichard Tran Mills         if (nz > isize) {
376938d9b04SRichard Tran Mills   #if defined(PETSC_HAVE_CRAY_VECTOR)
377938d9b04SRichard Tran Mills     #pragma _CRI preferstream
378938d9b04SRichard Tran Mills   #endif
379938d9b04SRichard Tran Mills           for (i = 0; i < isize; i++) {
380938d9b04SRichard Tran Mills   #if defined(PETSC_HAVE_CRAY_VECTOR)
381938d9b04SRichard Tran Mills     #pragma _CRI prefervector
382938d9b04SRichard Tran Mills   #endif
383f67b6f2eSRichard Tran Mills 
38480423c7aSSatish Balay   #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
385f67b6f2eSRichard Tran Mills             vec_y = _mm512_setzero_pd();
386f67b6f2eSRichard Tran Mills             ipos  = ip[i];
387f67b6f2eSRichard Tran Mills             for (j = 0; j < (nz >> 3); j++) {
388f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
389f67b6f2eSRichard Tran Mills               vec_vals = _mm512_loadu_pd(&aa[ipos]);
390f67b6f2eSRichard Tran Mills               vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
391f67b6f2eSRichard Tran Mills               vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
392f67b6f2eSRichard Tran Mills               ipos += 8;
393f67b6f2eSRichard Tran Mills             }
394f67b6f2eSRichard Tran Mills             if ((nz & 0x07) > 2) {
395f67b6f2eSRichard Tran Mills               mask     = (__mmask8)(0xff >> (8 - (nz & 0x07)));
396f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
397f67b6f2eSRichard Tran Mills               vec_vals = _mm512_loadu_pd(&aa[ipos]);
398f67b6f2eSRichard Tran Mills               vec_x    = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
399f67b6f2eSRichard Tran Mills               vec_y    = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
400f67b6f2eSRichard Tran Mills             } else if ((nz & 0x07) == 2) {
401f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
402f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
403f67b6f2eSRichard Tran Mills             } else if ((nz & 0x07) == 1) {
404f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
405f67b6f2eSRichard Tran Mills             }
406f67b6f2eSRichard Tran Mills             yp[i] += _mm512_reduce_add_pd(vec_y);
407f67b6f2eSRichard Tran Mills   #else
408938d9b04SRichard Tran Mills             for (j = 0; j < nz; j++) {
409938d9b04SRichard Tran Mills               ipos = ip[i] + j;
410938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
411938d9b04SRichard Tran Mills             }
412f67b6f2eSRichard Tran Mills   #endif
413938d9b04SRichard Tran Mills           }
414938d9b04SRichard Tran Mills         } else {
415938d9b04SRichard Tran Mills           /* Otherwise, there are enough rows in the chunk to make it
416938d9b04SRichard Tran Mills            * worthwhile to vectorize across the rows, that is, to do the
417938d9b04SRichard Tran Mills            * matvec by operating with "columns" of the chunk. */
418938d9b04SRichard Tran Mills           for (j = 0; j < nz; j++) {
41980423c7aSSatish Balay   #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
420f67b6f2eSRichard Tran Mills             vec_j = _mm256_set1_epi32(j);
421f67b6f2eSRichard Tran Mills             for (i = 0; i < ((isize >> 3) << 3); i += 8) {
422f67b6f2eSRichard Tran Mills               vec_y    = _mm512_loadu_pd(&yp[i]);
423f67b6f2eSRichard Tran Mills               vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
424f67b6f2eSRichard Tran Mills               vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
425f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
426f67b6f2eSRichard Tran Mills               vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
427f67b6f2eSRichard Tran Mills               vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
428f67b6f2eSRichard Tran Mills               vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
429f67b6f2eSRichard Tran Mills               _mm512_storeu_pd(&yp[i], vec_y);
430f67b6f2eSRichard Tran Mills             }
431f67b6f2eSRichard Tran Mills             for (i = isize - (isize & 0x07); i < isize; i++) {
432f67b6f2eSRichard Tran Mills               ipos = ip[i] + j;
433f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
434f67b6f2eSRichard Tran Mills             }
435f67b6f2eSRichard Tran Mills   #else
436938d9b04SRichard Tran Mills             for (i = 0; i < isize; i++) {
437938d9b04SRichard Tran Mills               ipos = ip[i] + j;
438938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
439938d9b04SRichard Tran Mills             }
440f67b6f2eSRichard Tran Mills   #endif
441938d9b04SRichard Tran Mills           }
442938d9b04SRichard Tran Mills         }
443938d9b04SRichard Tran Mills 
444938d9b04SRichard Tran Mills   #if defined(PETSC_HAVE_CRAY_VECTOR)
445938d9b04SRichard Tran Mills     #pragma _CRI ivdep
446938d9b04SRichard Tran Mills   #endif
447938d9b04SRichard Tran Mills         /* Put results from yp[] into non-permuted result vector y. */
448ad540459SPierre Jolivet         for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
449938d9b04SRichard Tran Mills       } /* End processing chunk of isize rows of a group. */
450938d9b04SRichard Tran Mills     }   /* End handling matvec for chunk with nz > 1. */
451938d9b04SRichard Tran Mills   }     /* End loop over igroup. */
452938d9b04SRichard Tran Mills #endif
4539566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
4549566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4559566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(yy, &y));
4563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
457938d9b04SRichard Tran Mills }
458938d9b04SRichard Tran Mills 
459938d9b04SRichard Tran Mills /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
460938d9b04SRichard Tran Mills  * Note that the names I used to designate the vectors differs from that
461938d9b04SRichard Tran Mills  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
462938d9b04SRichard Tran Mills  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
463938d9b04SRichard Tran Mills /*
464938d9b04SRichard Tran Mills     I hate having virtually identical code for the mult and the multadd!!!
465938d9b04SRichard Tran Mills */
466d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy)
467d71ae5a4SJacob Faibussowitsch {
468938d9b04SRichard Tran Mills   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
469938d9b04SRichard Tran Mills   const PetscScalar *x;
470938d9b04SRichard Tran Mills   PetscScalar       *y, *w;
471938d9b04SRichard Tran Mills   const MatScalar   *aa;
472938d9b04SRichard Tran Mills   const PetscInt    *aj, *ai;
473938d9b04SRichard Tran Mills #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
474938d9b04SRichard Tran Mills   PetscInt i, j;
475938d9b04SRichard Tran Mills #endif
476938d9b04SRichard Tran Mills 
477938d9b04SRichard Tran Mills   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
478938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm;
479938d9b04SRichard Tran Mills   PetscInt       *iperm; /* Points to the permutation vector. */
480938d9b04SRichard Tran Mills   PetscInt       *xgroup;
481938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
482938d9b04SRichard Tran Mills    * begin and end in iperm. */
483938d9b04SRichard Tran Mills   PetscInt *nzgroup;
484938d9b04SRichard Tran Mills   PetscInt  ngroup;
485938d9b04SRichard Tran Mills   PetscInt  igroup;
486938d9b04SRichard Tran Mills   PetscInt  jstart, jend;
487938d9b04SRichard Tran Mills   /* jstart is used in loops to denote the position in iperm where a
488938d9b04SRichard Tran Mills    * group starts; jend denotes the position where it ends.
489938d9b04SRichard Tran Mills    * (jend + 1 is where the next group starts.) */
490938d9b04SRichard Tran Mills   PetscInt    iold, nz;
491938d9b04SRichard Tran Mills   PetscInt    istart, iend, isize;
492938d9b04SRichard Tran Mills   PetscInt    ipos;
493938d9b04SRichard Tran Mills   PetscScalar yp[NDIM];
494938d9b04SRichard Tran Mills   PetscInt    ip[NDIM];
495938d9b04SRichard Tran Mills   /* yp[] and ip[] are treated as vector "registers" for performing
496938d9b04SRichard Tran Mills    * the mat-vec. */
497938d9b04SRichard Tran Mills 
498938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
499938d9b04SRichard Tran Mills   #pragma disjoint(*x, *y, *aa)
500938d9b04SRichard Tran Mills #endif
501938d9b04SRichard Tran Mills 
502938d9b04SRichard Tran Mills   PetscFunctionBegin;
5039566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5049566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, ww, &y, &w));
505938d9b04SRichard Tran Mills 
506938d9b04SRichard Tran Mills   aj = a->j; /* aj[k] gives column index for element aa[k]. */
507938d9b04SRichard Tran Mills   aa = a->a; /* Nonzero elements stored row-by-row. */
508938d9b04SRichard Tran Mills   ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
509938d9b04SRichard Tran Mills 
510938d9b04SRichard Tran Mills   /* Get the info we need about the permutations and groupings. */
511938d9b04SRichard Tran Mills   aijperm = (Mat_SeqAIJPERM *)A->spptr;
512938d9b04SRichard Tran Mills   iperm   = aijperm->iperm;
513938d9b04SRichard Tran Mills   ngroup  = aijperm->ngroup;
514938d9b04SRichard Tran Mills   xgroup  = aijperm->xgroup;
515938d9b04SRichard Tran Mills   nzgroup = aijperm->nzgroup;
516938d9b04SRichard Tran Mills 
517938d9b04SRichard Tran Mills #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
518938d9b04SRichard Tran Mills   fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w);
519938d9b04SRichard Tran Mills #else
520938d9b04SRichard Tran Mills 
521938d9b04SRichard Tran Mills   for (igroup = 0; igroup < ngroup; igroup++) {
522938d9b04SRichard Tran Mills     jstart = xgroup[igroup];
523938d9b04SRichard Tran Mills     jend   = xgroup[igroup + 1] - 1;
524938d9b04SRichard Tran Mills 
525938d9b04SRichard Tran Mills     nz = nzgroup[igroup];
526938d9b04SRichard Tran Mills 
527938d9b04SRichard Tran Mills     /* Handle the special cases where the number of nonzeros per row
528938d9b04SRichard Tran Mills      * in the group is either 0 or 1. */
529938d9b04SRichard Tran Mills     if (nz == 0) {
530938d9b04SRichard Tran Mills       for (i = jstart; i <= jend; i++) {
531938d9b04SRichard Tran Mills         iold    = iperm[i];
532938d9b04SRichard Tran Mills         y[iold] = w[iold];
533938d9b04SRichard Tran Mills       }
5349371c9d4SSatish Balay     } else if (nz == 1) {
535938d9b04SRichard Tran Mills       for (i = jstart; i <= jend; i++) {
536938d9b04SRichard Tran Mills         iold    = iperm[i];
537938d9b04SRichard Tran Mills         ipos    = ai[iold];
538938d9b04SRichard Tran Mills         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
539938d9b04SRichard Tran Mills       }
540938d9b04SRichard Tran Mills     }
541938d9b04SRichard Tran Mills     /* For the general case: */
542938d9b04SRichard Tran Mills     else {
543938d9b04SRichard Tran Mills       /* We work our way through the current group in chunks of NDIM rows
544938d9b04SRichard Tran Mills        * at a time. */
545938d9b04SRichard Tran Mills 
546938d9b04SRichard Tran Mills       for (istart = jstart; istart <= jend; istart += NDIM) {
547938d9b04SRichard Tran Mills         /* Figure out where the chunk of 'isize' rows ends in iperm.
548938d9b04SRichard Tran Mills          * 'isize may of course be less than NDIM for the last chunk. */
549938d9b04SRichard Tran Mills         iend = istart + (NDIM - 1);
550938d9b04SRichard Tran Mills         if (iend > jend) iend = jend;
551938d9b04SRichard Tran Mills         isize = iend - istart + 1;
552938d9b04SRichard Tran Mills 
553938d9b04SRichard Tran Mills         /* Initialize the yp[] array that will be used to hold part of
554938d9b04SRichard Tran Mills          * the permuted results vector, and figure out where in aa each
555938d9b04SRichard Tran Mills          * row of the chunk will begin. */
556938d9b04SRichard Tran Mills         for (i = 0; i < isize; i++) {
557938d9b04SRichard Tran Mills           iold = iperm[istart + i];
558938d9b04SRichard Tran Mills           /* iold is a row number from the matrix A *before* reordering. */
559938d9b04SRichard Tran Mills           ip[i] = ai[iold];
560938d9b04SRichard Tran Mills           /* ip[i] tells us where the ith row of the chunk begins in aa. */
561938d9b04SRichard Tran Mills           yp[i] = w[iold];
562938d9b04SRichard Tran Mills         }
563938d9b04SRichard Tran Mills 
564938d9b04SRichard Tran Mills         /* If the number of zeros per row exceeds the number of rows in
565938d9b04SRichard Tran Mills          * the chunk, we should vectorize along nz, that is, perform the
566938d9b04SRichard Tran Mills          * mat-vec one row at a time as in the usual CSR case. */
567938d9b04SRichard Tran Mills         if (nz > isize) {
568938d9b04SRichard Tran Mills   #if defined(PETSC_HAVE_CRAY_VECTOR)
569938d9b04SRichard Tran Mills     #pragma _CRI preferstream
570938d9b04SRichard Tran Mills   #endif
571938d9b04SRichard Tran Mills           for (i = 0; i < isize; i++) {
572938d9b04SRichard Tran Mills   #if defined(PETSC_HAVE_CRAY_VECTOR)
573938d9b04SRichard Tran Mills     #pragma _CRI prefervector
574938d9b04SRichard Tran Mills   #endif
575938d9b04SRichard Tran Mills             for (j = 0; j < nz; j++) {
576938d9b04SRichard Tran Mills               ipos = ip[i] + j;
577938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
578938d9b04SRichard Tran Mills             }
579938d9b04SRichard Tran Mills           }
580938d9b04SRichard Tran Mills         }
581938d9b04SRichard Tran Mills         /* Otherwise, there are enough rows in the chunk to make it
582938d9b04SRichard Tran Mills          * worthwhile to vectorize across the rows, that is, to do the
583938d9b04SRichard Tran Mills          * matvec by operating with "columns" of the chunk. */
584938d9b04SRichard Tran Mills         else {
585938d9b04SRichard Tran Mills           for (j = 0; j < nz; j++) {
586938d9b04SRichard Tran Mills             for (i = 0; i < isize; i++) {
587938d9b04SRichard Tran Mills               ipos = ip[i] + j;
588938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
589938d9b04SRichard Tran Mills             }
590938d9b04SRichard Tran Mills           }
591938d9b04SRichard Tran Mills         }
592938d9b04SRichard Tran Mills 
593938d9b04SRichard Tran Mills   #if defined(PETSC_HAVE_CRAY_VECTOR)
594938d9b04SRichard Tran Mills     #pragma _CRI ivdep
595938d9b04SRichard Tran Mills   #endif
596938d9b04SRichard Tran Mills         /* Put results from yp[] into non-permuted result vector y. */
597ad540459SPierre Jolivet         for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
598938d9b04SRichard Tran Mills       } /* End processing chunk of isize rows of a group. */
599938d9b04SRichard Tran Mills 
600938d9b04SRichard Tran Mills     } /* End handling matvec for chunk with nz > 1. */
601938d9b04SRichard Tran Mills   }   /* End loop over igroup. */
602938d9b04SRichard Tran Mills 
603938d9b04SRichard Tran Mills #endif
6049566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
6059566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
6069566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
6073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
608938d9b04SRichard Tran Mills }
609938d9b04SRichard Tran Mills 
610938d9b04SRichard Tran Mills /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
611938d9b04SRichard Tran Mills  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
612938d9b04SRichard Tran Mills  * routine, but can also be used to convert an assembled SeqAIJ matrix
613938d9b04SRichard Tran Mills  * into a SeqAIJPERM one. */
614d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat)
615d71ae5a4SJacob Faibussowitsch {
616938d9b04SRichard Tran Mills   Mat             B = *newmat;
617938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm;
618938d9b04SRichard Tran Mills   PetscBool       sametype;
619938d9b04SRichard Tran Mills 
620938d9b04SRichard Tran Mills   PetscFunctionBegin;
62148a46eb9SPierre Jolivet   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
6229566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
6233ba16761SJacob Faibussowitsch   if (sametype) PetscFunctionReturn(PETSC_SUCCESS);
624938d9b04SRichard Tran Mills 
6254dfa11a4SJacob Faibussowitsch   PetscCall(PetscNew(&aijperm));
626938d9b04SRichard Tran Mills   B->spptr = (void *)aijperm;
627938d9b04SRichard Tran Mills 
628938d9b04SRichard Tran Mills   /* Set function pointers for methods that we inherit from AIJ but override. */
629938d9b04SRichard Tran Mills   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
630938d9b04SRichard Tran Mills   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
631938d9b04SRichard Tran Mills   B->ops->destroy     = MatDestroy_SeqAIJPERM;
632938d9b04SRichard Tran Mills   B->ops->mult        = MatMult_SeqAIJPERM;
633938d9b04SRichard Tran Mills   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
634938d9b04SRichard Tran Mills 
635938d9b04SRichard Tran Mills   aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
636938d9b04SRichard Tran Mills   /* If A has already been assembled, compute the permutation. */
6371baa6e33SBarry Smith   if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));
638938d9b04SRichard Tran Mills 
6399566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
640938d9b04SRichard Tran Mills 
6419566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM));
642938d9b04SRichard Tran Mills   *newmat = B;
6433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
644938d9b04SRichard Tran Mills }
645938d9b04SRichard Tran Mills 
646938d9b04SRichard Tran Mills /*@C
64711a5261eSBarry Smith    MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.
64811a5261eSBarry Smith    This type inherits from `MATSEQAIJ`, but calculates some additional permutation
649938d9b04SRichard Tran Mills    information that is used to allow better vectorization of some
65011a5261eSBarry Smith    operations.  At the cost of increased storage, the `MATSEQAIJ` formatted
651938d9b04SRichard Tran Mills    matrix can be copied to a format in which pieces of the matrix are
652938d9b04SRichard Tran Mills    stored in ELLPACK format, allowing the vectorized matrix multiply
65320f4b53cSBarry Smith    routine to use stride-1 memory accesses.
654938d9b04SRichard Tran Mills 
655d083f849SBarry Smith    Collective
656938d9b04SRichard Tran Mills 
657938d9b04SRichard Tran Mills    Input Parameters:
65811a5261eSBarry Smith +  comm - MPI communicator, set to `PETSC_COMM_SELF`
659938d9b04SRichard Tran Mills .  m - number of rows
660938d9b04SRichard Tran Mills .  n - number of columns
66120f4b53cSBarry Smith .  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is given
66220f4b53cSBarry Smith -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
663938d9b04SRichard Tran Mills 
664938d9b04SRichard Tran Mills    Output Parameter:
665938d9b04SRichard Tran Mills .  A - the matrix
666938d9b04SRichard Tran Mills 
667938d9b04SRichard Tran Mills    Level: intermediate
668938d9b04SRichard Tran Mills 
669*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
670938d9b04SRichard Tran Mills @*/
671d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
672d71ae5a4SJacob Faibussowitsch {
673938d9b04SRichard Tran Mills   PetscFunctionBegin;
6749566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
6759566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
6769566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJPERM));
6779566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz));
6783ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
679938d9b04SRichard Tran Mills }
680938d9b04SRichard Tran Mills 
681d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
682d71ae5a4SJacob Faibussowitsch {
683938d9b04SRichard Tran Mills   PetscFunctionBegin;
6849566063dSJacob Faibussowitsch   PetscCall(MatSetType(A, MATSEQAIJ));
6859566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A));
6863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
687938d9b04SRichard Tran Mills }
688