xref: /petsc/src/mat/impls/aij/seq/aijperm/aijperm.c (revision 4dfa11a44d5adf2389f1d3acbc8f3c1116dc6c3a)
1938d9b04SRichard Tran Mills 
2938d9b04SRichard Tran Mills /*
3938d9b04SRichard Tran Mills   Defines basic operations for the MATSEQAIJPERM matrix class.
4938d9b04SRichard Tran Mills   This class is derived from the MATSEQAIJ class and retains the
5938d9b04SRichard Tran Mills   compressed row storage (aka Yale sparse matrix format) but augments
6938d9b04SRichard Tran Mills   it with some permutation information that enables some operations
7938d9b04SRichard Tran Mills   to be more vectorizable.  A physically rearranged copy of the matrix
8938d9b04SRichard Tran Mills   may be stored if the user desires.
9938d9b04SRichard Tran Mills 
10938d9b04SRichard Tran Mills   Eventually a variety of permutations may be supported.
11938d9b04SRichard Tran Mills */
12938d9b04SRichard Tran Mills 
13938d9b04SRichard Tran Mills #include <../src/mat/impls/aij/seq/aij.h>
14938d9b04SRichard Tran Mills 
1580423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16f67b6f2eSRichard Tran Mills #include <immintrin.h>
17f67b6f2eSRichard Tran Mills 
18f67b6f2eSRichard Tran Mills #if !defined(_MM_SCALE_8)
19f67b6f2eSRichard Tran Mills #define _MM_SCALE_8 8
20f67b6f2eSRichard Tran Mills #endif
21f67b6f2eSRichard Tran Mills #if !defined(_MM_SCALE_4)
22f67b6f2eSRichard Tran Mills #define _MM_SCALE_4 4
23f67b6f2eSRichard Tran Mills #endif
24f67b6f2eSRichard Tran Mills #endif
25f67b6f2eSRichard Tran Mills 
26938d9b04SRichard Tran Mills #define NDIM 512
27938d9b04SRichard Tran Mills /* NDIM specifies how many rows at a time we should work with when
28938d9b04SRichard Tran Mills  * performing the vectorized mat-vec.  This depends on various factors
29938d9b04SRichard Tran Mills  * such as vector register length, etc., and I really need to add a
30938d9b04SRichard Tran Mills  * way for the user (or the library) to tune this.  I'm setting it to
31938d9b04SRichard Tran Mills  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32938d9b04SRichard Tran Mills  * routines. */
33938d9b04SRichard Tran Mills 
34938d9b04SRichard Tran Mills typedef struct {
35938d9b04SRichard Tran Mills   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36938d9b04SRichard Tran Mills 
37938d9b04SRichard Tran Mills   PetscInt  ngroup;
38938d9b04SRichard Tran Mills   PetscInt *xgroup;
39938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
40938d9b04SRichard Tran Mills    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41938d9b04SRichard Tran Mills    * where the ith group begins. */
42938d9b04SRichard Tran Mills 
43938d9b04SRichard Tran Mills   PetscInt *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
44938d9b04SRichard Tran Mills   PetscInt *iperm;   /* The permutation vector. */
45938d9b04SRichard Tran Mills 
46938d9b04SRichard Tran Mills   /* Some of this stuff is for Ed's recursive triangular solve.
47938d9b04SRichard Tran Mills    * I'm not sure what I need yet. */
48938d9b04SRichard Tran Mills   PetscInt   blocksize;
49938d9b04SRichard Tran Mills   PetscInt   nstep;
50938d9b04SRichard Tran Mills   PetscInt  *jstart_list;
51938d9b04SRichard Tran Mills   PetscInt  *jend_list;
52938d9b04SRichard Tran Mills   PetscInt  *action_list;
53938d9b04SRichard Tran Mills   PetscInt  *ngroup_list;
54938d9b04SRichard Tran Mills   PetscInt **ipointer_list;
55938d9b04SRichard Tran Mills   PetscInt **xgroup_list;
56938d9b04SRichard Tran Mills   PetscInt **nzgroup_list;
57938d9b04SRichard Tran Mills   PetscInt **iperm_list;
58938d9b04SRichard Tran Mills } Mat_SeqAIJPERM;
59938d9b04SRichard Tran Mills 
609371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A, MatType type, MatReuse reuse, Mat *newmat) {
61938d9b04SRichard Tran Mills   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
62938d9b04SRichard Tran Mills   /* so we will ignore 'MatType type'. */
63938d9b04SRichard Tran Mills   Mat             B       = *newmat;
64938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
65938d9b04SRichard Tran Mills 
66938d9b04SRichard Tran Mills   PetscFunctionBegin;
67938d9b04SRichard Tran Mills   if (reuse == MAT_INITIAL_MATRIX) {
689566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
69938d9b04SRichard Tran Mills     aijperm = (Mat_SeqAIJPERM *)B->spptr;
70938d9b04SRichard Tran Mills   }
71938d9b04SRichard Tran Mills 
72938d9b04SRichard Tran Mills   /* Reset the original function pointers. */
73938d9b04SRichard Tran Mills   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
74938d9b04SRichard Tran Mills   B->ops->destroy     = MatDestroy_SeqAIJ;
75938d9b04SRichard Tran Mills   B->ops->duplicate   = MatDuplicate_SeqAIJ;
76938d9b04SRichard Tran Mills   B->ops->mult        = MatMult_SeqAIJ;
77938d9b04SRichard Tran Mills   B->ops->multadd     = MatMultAdd_SeqAIJ;
78938d9b04SRichard Tran Mills 
799566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", NULL));
80938d9b04SRichard Tran Mills 
81938d9b04SRichard Tran Mills   /* Free everything in the Mat_SeqAIJPERM data structure.*/
829566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->xgroup));
839566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->nzgroup));
849566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->iperm));
859566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->spptr));
86938d9b04SRichard Tran Mills 
87938d9b04SRichard Tran Mills   /* Change the type of B to MATSEQAIJ. */
889566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ));
89938d9b04SRichard Tran Mills 
90938d9b04SRichard Tran Mills   *newmat = B;
91938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
92938d9b04SRichard Tran Mills }
93938d9b04SRichard Tran Mills 
949371c9d4SSatish Balay PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) {
95938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
96938d9b04SRichard Tran Mills 
97938d9b04SRichard Tran Mills   PetscFunctionBegin;
98938d9b04SRichard Tran Mills   if (aijperm) {
99938d9b04SRichard Tran Mills     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
1009566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->xgroup));
1019566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->nzgroup));
1029566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm->iperm));
1039566063dSJacob Faibussowitsch     PetscCall(PetscFree(A->spptr));
104938d9b04SRichard Tran Mills   }
105938d9b04SRichard Tran Mills   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
106938d9b04SRichard Tran Mills    * to destroy everything that remains. */
1079566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ));
108938d9b04SRichard Tran Mills   /* Note that I don't call MatSetType().  I believe this is because that
109938d9b04SRichard Tran Mills    * is only to be called when *building* a matrix.  I could be wrong, but
110938d9b04SRichard Tran Mills    * that is how things work for the SuperLU matrix class. */
1112e956fe4SStefano Zampini   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijperm_seqaij_C", NULL));
1129566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
113938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
114938d9b04SRichard Tran Mills }
115938d9b04SRichard Tran Mills 
1169371c9d4SSatish Balay PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) {
117938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
118938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm_dest;
119938d9b04SRichard Tran Mills   PetscBool       perm;
120938d9b04SRichard Tran Mills 
121938d9b04SRichard Tran Mills   PetscFunctionBegin;
1229566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, op, M));
1239566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)*M, MATSEQAIJPERM, &perm));
124938d9b04SRichard Tran Mills   if (perm) {
125938d9b04SRichard Tran Mills     aijperm_dest = (Mat_SeqAIJPERM *)(*M)->spptr;
1269566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->xgroup));
1279566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->nzgroup));
1289566063dSJacob Faibussowitsch     PetscCall(PetscFree(aijperm_dest->iperm));
129938d9b04SRichard Tran Mills   } else {
130*4dfa11a4SJacob Faibussowitsch     PetscCall(PetscNew(&aijperm_dest));
131938d9b04SRichard Tran Mills     (*M)->spptr = (void *)aijperm_dest;
1329566063dSJacob Faibussowitsch     PetscCall(PetscObjectChangeTypeName((PetscObject)*M, MATSEQAIJPERM));
1339566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)*M, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
134938d9b04SRichard Tran Mills   }
1359566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest, aijperm, 1));
136938d9b04SRichard Tran Mills   /* Allocate space for, and copy the grouping and permutation info.
137938d9b04SRichard Tran Mills    * I note that when the groups are initially determined in
138938d9b04SRichard Tran Mills    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
139938d9b04SRichard Tran Mills    * necessary.  But at this point, we know how large they need to be, and
140938d9b04SRichard Tran Mills    * allocate only the necessary amount of memory.  So the duplicated matrix
141938d9b04SRichard Tran Mills    * may actually use slightly less storage than the original! */
1429566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(A->rmap->n, &aijperm_dest->iperm));
1439566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(aijperm->ngroup + 1, &aijperm_dest->xgroup));
1449566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup));
1459566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->iperm, aijperm->iperm, A->rmap->n));
1469566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->xgroup, aijperm->xgroup, aijperm->ngroup + 1));
1479566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(aijperm_dest->nzgroup, aijperm->nzgroup, aijperm->ngroup));
148938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
149938d9b04SRichard Tran Mills }
150938d9b04SRichard Tran Mills 
1519371c9d4SSatish Balay PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) {
152938d9b04SRichard Tran Mills   Mat_SeqAIJ     *a       = (Mat_SeqAIJ *)(A)->data;
153938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
154938d9b04SRichard Tran Mills   PetscInt        m;     /* Number of rows in the matrix. */
155938d9b04SRichard Tran Mills   PetscInt       *ia;    /* From the CSR representation; points to the beginning  of each row. */
156938d9b04SRichard Tran Mills   PetscInt        maxnz; /* Maximum number of nonzeros in any row. */
157938d9b04SRichard Tran Mills   PetscInt       *rows_in_bucket;
158938d9b04SRichard Tran Mills   /* To construct the permutation, we sort each row into one of maxnz
159938d9b04SRichard Tran Mills    * buckets based on how many nonzeros are in the row. */
160938d9b04SRichard Tran Mills   PetscInt        nz;
161938d9b04SRichard Tran Mills   PetscInt       *nz_in_row; /* the number of nonzero elements in row k. */
162938d9b04SRichard Tran Mills   PetscInt       *ipnz;
163938d9b04SRichard Tran Mills   /* When constructing the iperm permutation vector,
164938d9b04SRichard Tran Mills    * ipnz[nz] is used to point to the next place in the permutation vector
165938d9b04SRichard Tran Mills    * that a row with nz nonzero elements should be placed.*/
166938d9b04SRichard Tran Mills   PetscInt        i, ngroup, istart, ipos;
167938d9b04SRichard Tran Mills 
168938d9b04SRichard Tran Mills   PetscFunctionBegin;
169938d9b04SRichard Tran Mills   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
170938d9b04SRichard Tran Mills   aijperm->nonzerostate = A->nonzerostate;
171938d9b04SRichard Tran Mills   /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
1729566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->xgroup));
1739566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->nzgroup));
1749566063dSJacob Faibussowitsch   PetscCall(PetscFree(aijperm->iperm));
175938d9b04SRichard Tran Mills 
176938d9b04SRichard Tran Mills   m  = A->rmap->n;
177938d9b04SRichard Tran Mills   ia = a->i;
178938d9b04SRichard Tran Mills 
179938d9b04SRichard Tran Mills   /* Allocate the arrays that will hold the permutation vector. */
1809566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &aijperm->iperm));
181938d9b04SRichard Tran Mills 
182938d9b04SRichard Tran Mills   /* Allocate some temporary work arrays that will be used in
1836aad120cSJose E. Roman    * calculating the permutation vector and groupings. */
1849566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &nz_in_row));
185938d9b04SRichard Tran Mills 
186938d9b04SRichard Tran Mills   /* Now actually figure out the permutation and grouping. */
187938d9b04SRichard Tran Mills 
188938d9b04SRichard Tran Mills   /* First pass: Determine number of nonzeros in each row, maximum
189938d9b04SRichard Tran Mills    * number of nonzeros in any row, and how many rows fall into each
190938d9b04SRichard Tran Mills    * "bucket" of rows with same number of nonzeros. */
191938d9b04SRichard Tran Mills   maxnz = 0;
192938d9b04SRichard Tran Mills   for (i = 0; i < m; i++) {
193938d9b04SRichard Tran Mills     nz_in_row[i] = ia[i + 1] - ia[i];
194938d9b04SRichard Tran Mills     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
195938d9b04SRichard Tran Mills   }
1969566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &rows_in_bucket));
1979566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(PetscMax(maxnz, m) + 1, &ipnz));
198938d9b04SRichard Tran Mills 
199ad540459SPierre Jolivet   for (i = 0; i <= maxnz; i++) rows_in_bucket[i] = 0;
200938d9b04SRichard Tran Mills   for (i = 0; i < m; i++) {
201938d9b04SRichard Tran Mills     nz = nz_in_row[i];
202938d9b04SRichard Tran Mills     rows_in_bucket[nz]++;
203938d9b04SRichard Tran Mills   }
204938d9b04SRichard Tran Mills 
205938d9b04SRichard Tran Mills   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
206938d9b04SRichard Tran Mills    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
207938d9b04SRichard Tran Mills    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
208938d9b04SRichard Tran Mills    * then xgroup[] must consist of (maxnz + 2) elements, since the last
209938d9b04SRichard Tran Mills    * element of xgroup will tell us where the (maxnz + 1)th group ends.
210938d9b04SRichard Tran Mills    * We allocate space for the maximum number of groups;
211938d9b04SRichard Tran Mills    * that is potentially a little wasteful, but not too much so.
212938d9b04SRichard Tran Mills    * Perhaps I should fix it later. */
2139566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(maxnz + 2, &aijperm->xgroup));
2149566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(maxnz + 1, &aijperm->nzgroup));
215938d9b04SRichard Tran Mills 
216938d9b04SRichard Tran Mills   /* Second pass.  Look at what is in the buckets and create the groupings.
217938d9b04SRichard Tran Mills    * Note that it is OK to have a group of rows with no non-zero values. */
218938d9b04SRichard Tran Mills   ngroup = 0;
219938d9b04SRichard Tran Mills   istart = 0;
220938d9b04SRichard Tran Mills   for (i = 0; i <= maxnz; i++) {
221938d9b04SRichard Tran Mills     if (rows_in_bucket[i] > 0) {
222938d9b04SRichard Tran Mills       aijperm->nzgroup[ngroup] = i;
223938d9b04SRichard Tran Mills       aijperm->xgroup[ngroup]  = istart;
224938d9b04SRichard Tran Mills       ngroup++;
225938d9b04SRichard Tran Mills       istart += rows_in_bucket[i];
226938d9b04SRichard Tran Mills     }
227938d9b04SRichard Tran Mills   }
228938d9b04SRichard Tran Mills 
229938d9b04SRichard Tran Mills   aijperm->xgroup[ngroup] = istart;
230938d9b04SRichard Tran Mills   aijperm->ngroup         = ngroup;
231938d9b04SRichard Tran Mills 
232938d9b04SRichard Tran Mills   /* Now fill in the permutation vector iperm. */
233938d9b04SRichard Tran Mills   ipnz[0] = 0;
234ad540459SPierre Jolivet   for (i = 0; i < maxnz; i++) ipnz[i + 1] = ipnz[i] + rows_in_bucket[i];
235938d9b04SRichard Tran Mills 
236938d9b04SRichard Tran Mills   for (i = 0; i < m; i++) {
237938d9b04SRichard Tran Mills     nz                   = nz_in_row[i];
238938d9b04SRichard Tran Mills     ipos                 = ipnz[nz];
239938d9b04SRichard Tran Mills     aijperm->iperm[ipos] = i;
240938d9b04SRichard Tran Mills     ipnz[nz]++;
241938d9b04SRichard Tran Mills   }
242938d9b04SRichard Tran Mills 
243938d9b04SRichard Tran Mills   /* Clean up temporary work arrays. */
2449566063dSJacob Faibussowitsch   PetscCall(PetscFree(rows_in_bucket));
2459566063dSJacob Faibussowitsch   PetscCall(PetscFree(ipnz));
2469566063dSJacob Faibussowitsch   PetscCall(PetscFree(nz_in_row));
247938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
248938d9b04SRichard Tran Mills }
249938d9b04SRichard Tran Mills 
2509371c9d4SSatish Balay PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) {
251938d9b04SRichard Tran Mills   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
252938d9b04SRichard Tran Mills 
253938d9b04SRichard Tran Mills   PetscFunctionBegin;
254938d9b04SRichard Tran Mills   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
255938d9b04SRichard Tran Mills 
256938d9b04SRichard Tran Mills   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
257938d9b04SRichard Tran Mills    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
258938d9b04SRichard Tran Mills    * I'm not sure if this is the best way to do this, but it avoids
259938d9b04SRichard Tran Mills    * a lot of code duplication.
260938d9b04SRichard Tran Mills    * I also note that currently MATSEQAIJPERM doesn't know anything about
261938d9b04SRichard Tran Mills    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
262938d9b04SRichard Tran Mills    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
263938d9b04SRichard Tran Mills    * this, this may break things.  (Don't know... haven't looked at it.) */
264938d9b04SRichard Tran Mills   a->inode.use = PETSC_FALSE;
2659566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
266938d9b04SRichard Tran Mills 
267938d9b04SRichard Tran Mills   /* Now calculate the permutation and grouping information. */
2689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJPERM_create_perm(A));
269938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
270938d9b04SRichard Tran Mills }
271938d9b04SRichard Tran Mills 
2729371c9d4SSatish Balay PetscErrorCode MatMult_SeqAIJPERM(Mat A, Vec xx, Vec yy) {
273938d9b04SRichard Tran Mills   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
274938d9b04SRichard Tran Mills   const PetscScalar *x;
275938d9b04SRichard Tran Mills   PetscScalar       *y;
276938d9b04SRichard Tran Mills   const MatScalar   *aa;
277938d9b04SRichard Tran Mills   const PetscInt    *aj, *ai;
278938d9b04SRichard Tran Mills #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
279938d9b04SRichard Tran Mills   PetscInt i, j;
280938d9b04SRichard Tran Mills #endif
28180423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
282f67b6f2eSRichard Tran Mills   __m512d  vec_x, vec_y, vec_vals;
283f67b6f2eSRichard Tran Mills   __m256i  vec_idx, vec_ipos, vec_j;
284f67b6f2eSRichard Tran Mills   __mmask8 mask;
285f67b6f2eSRichard Tran Mills #endif
286938d9b04SRichard Tran Mills 
287938d9b04SRichard Tran Mills   /* Variables that don't appear in MatMult_SeqAIJ. */
288938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM *)A->spptr;
289938d9b04SRichard Tran Mills   PetscInt       *iperm; /* Points to the permutation vector. */
290938d9b04SRichard Tran Mills   PetscInt       *xgroup;
291938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
292938d9b04SRichard Tran Mills    * begin and end in iperm. */
293938d9b04SRichard Tran Mills   PetscInt       *nzgroup;
294938d9b04SRichard Tran Mills   PetscInt        ngroup;
295938d9b04SRichard Tran Mills   PetscInt        igroup;
296938d9b04SRichard Tran Mills   PetscInt        jstart, jend;
297938d9b04SRichard Tran Mills   /* jstart is used in loops to denote the position in iperm where a
298938d9b04SRichard Tran Mills    * group starts; jend denotes the position where it ends.
299938d9b04SRichard Tran Mills    * (jend + 1 is where the next group starts.) */
300938d9b04SRichard Tran Mills   PetscInt        iold, nz;
301938d9b04SRichard Tran Mills   PetscInt        istart, iend, isize;
302938d9b04SRichard Tran Mills   PetscInt        ipos;
303938d9b04SRichard Tran Mills   PetscScalar     yp[NDIM];
304938d9b04SRichard Tran Mills   PetscInt        ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
305938d9b04SRichard Tran Mills 
306938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
307938d9b04SRichard Tran Mills #pragma disjoint(*x, *y, *aa)
308938d9b04SRichard Tran Mills #endif
309938d9b04SRichard Tran Mills 
310938d9b04SRichard Tran Mills   PetscFunctionBegin;
3119566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3129566063dSJacob Faibussowitsch   PetscCall(VecGetArray(yy, &y));
313938d9b04SRichard Tran Mills   aj = a->j; /* aj[k] gives column index for element aa[k]. */
314938d9b04SRichard Tran Mills   aa = a->a; /* Nonzero elements stored row-by-row. */
315938d9b04SRichard Tran Mills   ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
316938d9b04SRichard Tran Mills 
317938d9b04SRichard Tran Mills   /* Get the info we need about the permutations and groupings. */
318938d9b04SRichard Tran Mills   iperm   = aijperm->iperm;
319938d9b04SRichard Tran Mills   ngroup  = aijperm->ngroup;
320938d9b04SRichard Tran Mills   xgroup  = aijperm->xgroup;
321938d9b04SRichard Tran Mills   nzgroup = aijperm->nzgroup;
322938d9b04SRichard Tran Mills 
323938d9b04SRichard Tran Mills #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
324938d9b04SRichard Tran Mills   fortranmultaijperm_(&m, x, ii, aj, aa, y);
325938d9b04SRichard Tran Mills #else
326938d9b04SRichard Tran Mills 
327938d9b04SRichard Tran Mills   for (igroup = 0; igroup < ngroup; igroup++) {
328938d9b04SRichard Tran Mills     jstart = xgroup[igroup];
329938d9b04SRichard Tran Mills     jend   = xgroup[igroup + 1] - 1;
330938d9b04SRichard Tran Mills     nz     = nzgroup[igroup];
331938d9b04SRichard Tran Mills 
332938d9b04SRichard Tran Mills     /* Handle the special cases where the number of nonzeros per row
333938d9b04SRichard Tran Mills      * in the group is either 0 or 1. */
334938d9b04SRichard Tran Mills     if (nz == 0) {
335ad540459SPierre Jolivet       for (i = jstart; i <= jend; i++) y[iperm[i]] = 0.0;
336938d9b04SRichard Tran Mills     } else if (nz == 1) {
337938d9b04SRichard Tran Mills       for (i = jstart; i <= jend; i++) {
338938d9b04SRichard Tran Mills         iold    = iperm[i];
339938d9b04SRichard Tran Mills         ipos    = ai[iold];
340938d9b04SRichard Tran Mills         y[iold] = aa[ipos] * x[aj[ipos]];
341938d9b04SRichard Tran Mills       }
342938d9b04SRichard Tran Mills     } else {
343938d9b04SRichard Tran Mills       /* We work our way through the current group in chunks of NDIM rows
344938d9b04SRichard Tran Mills        * at a time. */
345938d9b04SRichard Tran Mills 
346938d9b04SRichard Tran Mills       for (istart = jstart; istart <= jend; istart += NDIM) {
347938d9b04SRichard Tran Mills         /* Figure out where the chunk of 'isize' rows ends in iperm.
348938d9b04SRichard Tran Mills          * 'isize may of course be less than NDIM for the last chunk. */
349938d9b04SRichard Tran Mills         iend = istart + (NDIM - 1);
350938d9b04SRichard Tran Mills 
351938d9b04SRichard Tran Mills         if (iend > jend) iend = jend;
352938d9b04SRichard Tran Mills 
353938d9b04SRichard Tran Mills         isize = iend - istart + 1;
354938d9b04SRichard Tran Mills 
355938d9b04SRichard Tran Mills         /* Initialize the yp[] array that will be used to hold part of
356938d9b04SRichard Tran Mills          * the permuted results vector, and figure out where in aa each
357938d9b04SRichard Tran Mills          * row of the chunk will begin. */
358938d9b04SRichard Tran Mills         for (i = 0; i < isize; i++) {
359938d9b04SRichard Tran Mills           iold  = iperm[istart + i];
360938d9b04SRichard Tran Mills           /* iold is a row number from the matrix A *before* reordering. */
361938d9b04SRichard Tran Mills           ip[i] = ai[iold];
362938d9b04SRichard Tran Mills           /* ip[i] tells us where the ith row of the chunk begins in aa. */
363938d9b04SRichard Tran Mills           yp[i] = (PetscScalar)0.0;
364938d9b04SRichard Tran Mills         }
365938d9b04SRichard Tran Mills 
366938d9b04SRichard Tran Mills         /* If the number of zeros per row exceeds the number of rows in
367938d9b04SRichard Tran Mills          * the chunk, we should vectorize along nz, that is, perform the
368938d9b04SRichard Tran Mills          * mat-vec one row at a time as in the usual CSR case. */
369938d9b04SRichard Tran Mills         if (nz > isize) {
370938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
371938d9b04SRichard Tran Mills #pragma _CRI preferstream
372938d9b04SRichard Tran Mills #endif
373938d9b04SRichard Tran Mills           for (i = 0; i < isize; i++) {
374938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
375938d9b04SRichard Tran Mills #pragma _CRI prefervector
376938d9b04SRichard Tran Mills #endif
377f67b6f2eSRichard Tran Mills 
37880423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
379f67b6f2eSRichard Tran Mills             vec_y = _mm512_setzero_pd();
380f67b6f2eSRichard Tran Mills             ipos  = ip[i];
381f67b6f2eSRichard Tran Mills             for (j = 0; j < (nz >> 3); j++) {
382f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
383f67b6f2eSRichard Tran Mills               vec_vals = _mm512_loadu_pd(&aa[ipos]);
384f67b6f2eSRichard Tran Mills               vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
385f67b6f2eSRichard Tran Mills               vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
386f67b6f2eSRichard Tran Mills               ipos += 8;
387f67b6f2eSRichard Tran Mills             }
388f67b6f2eSRichard Tran Mills             if ((nz & 0x07) > 2) {
389f67b6f2eSRichard Tran Mills               mask     = (__mmask8)(0xff >> (8 - (nz & 0x07)));
390f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_loadu_si256((__m256i const *)&aj[ipos]);
391f67b6f2eSRichard Tran Mills               vec_vals = _mm512_loadu_pd(&aa[ipos]);
392f67b6f2eSRichard Tran Mills               vec_x    = _mm512_mask_i32gather_pd(vec_x, mask, vec_idx, x, _MM_SCALE_8);
393f67b6f2eSRichard Tran Mills               vec_y    = _mm512_mask3_fmadd_pd(vec_x, vec_vals, vec_y, mask);
394f67b6f2eSRichard Tran Mills             } else if ((nz & 0x07) == 2) {
395f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
396f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos + 1] * x[aj[ipos + 1]];
397f67b6f2eSRichard Tran Mills             } else if ((nz & 0x07) == 1) {
398f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
399f67b6f2eSRichard Tran Mills             }
400f67b6f2eSRichard Tran Mills             yp[i] += _mm512_reduce_add_pd(vec_y);
401f67b6f2eSRichard Tran Mills #else
402938d9b04SRichard Tran Mills             for (j = 0; j < nz; j++) {
403938d9b04SRichard Tran Mills               ipos = ip[i] + j;
404938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
405938d9b04SRichard Tran Mills             }
406f67b6f2eSRichard Tran Mills #endif
407938d9b04SRichard Tran Mills           }
408938d9b04SRichard Tran Mills         } else {
409938d9b04SRichard Tran Mills           /* Otherwise, there are enough rows in the chunk to make it
410938d9b04SRichard Tran Mills            * worthwhile to vectorize across the rows, that is, to do the
411938d9b04SRichard Tran Mills            * matvec by operating with "columns" of the chunk. */
412938d9b04SRichard Tran Mills           for (j = 0; j < nz; j++) {
41380423c7aSSatish Balay #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
414f67b6f2eSRichard Tran Mills             vec_j = _mm256_set1_epi32(j);
415f67b6f2eSRichard Tran Mills             for (i = 0; i < ((isize >> 3) << 3); i += 8) {
416f67b6f2eSRichard Tran Mills               vec_y    = _mm512_loadu_pd(&yp[i]);
417f67b6f2eSRichard Tran Mills               vec_ipos = _mm256_loadu_si256((__m256i const *)&ip[i]);
418f67b6f2eSRichard Tran Mills               vec_ipos = _mm256_add_epi32(vec_ipos, vec_j);
419f67b6f2eSRichard Tran Mills               vec_idx  = _mm256_i32gather_epi32(aj, vec_ipos, _MM_SCALE_4);
420f67b6f2eSRichard Tran Mills               vec_vals = _mm512_i32gather_pd(vec_ipos, aa, _MM_SCALE_8);
421f67b6f2eSRichard Tran Mills               vec_x    = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8);
422f67b6f2eSRichard Tran Mills               vec_y    = _mm512_fmadd_pd(vec_x, vec_vals, vec_y);
423f67b6f2eSRichard Tran Mills               _mm512_storeu_pd(&yp[i], vec_y);
424f67b6f2eSRichard Tran Mills             }
425f67b6f2eSRichard Tran Mills             for (i = isize - (isize & 0x07); i < isize; i++) {
426f67b6f2eSRichard Tran Mills               ipos = ip[i] + j;
427f67b6f2eSRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
428f67b6f2eSRichard Tran Mills             }
429f67b6f2eSRichard Tran Mills #else
430938d9b04SRichard Tran Mills             for (i = 0; i < isize; i++) {
431938d9b04SRichard Tran Mills               ipos = ip[i] + j;
432938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
433938d9b04SRichard Tran Mills             }
434f67b6f2eSRichard Tran Mills #endif
435938d9b04SRichard Tran Mills           }
436938d9b04SRichard Tran Mills         }
437938d9b04SRichard Tran Mills 
438938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
439938d9b04SRichard Tran Mills #pragma _CRI ivdep
440938d9b04SRichard Tran Mills #endif
441938d9b04SRichard Tran Mills         /* Put results from yp[] into non-permuted result vector y. */
442ad540459SPierre Jolivet         for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
443938d9b04SRichard Tran Mills       } /* End processing chunk of isize rows of a group. */
444938d9b04SRichard Tran Mills     }   /* End handling matvec for chunk with nz > 1. */
445938d9b04SRichard Tran Mills   }     /* End loop over igroup. */
446938d9b04SRichard Tran Mills #endif
4479566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(PetscMax(2.0 * a->nz - A->rmap->n, 0)));
4489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
4499566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(yy, &y));
450938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
451938d9b04SRichard Tran Mills }
452938d9b04SRichard Tran Mills 
453938d9b04SRichard Tran Mills /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
454938d9b04SRichard Tran Mills  * Note that the names I used to designate the vectors differs from that
455938d9b04SRichard Tran Mills  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
456938d9b04SRichard Tran Mills  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
457938d9b04SRichard Tran Mills /*
458938d9b04SRichard Tran Mills     I hate having virtually identical code for the mult and the multadd!!!
459938d9b04SRichard Tran Mills */
4609371c9d4SSatish Balay PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A, Vec xx, Vec ww, Vec yy) {
461938d9b04SRichard Tran Mills   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
462938d9b04SRichard Tran Mills   const PetscScalar *x;
463938d9b04SRichard Tran Mills   PetscScalar       *y, *w;
464938d9b04SRichard Tran Mills   const MatScalar   *aa;
465938d9b04SRichard Tran Mills   const PetscInt    *aj, *ai;
466938d9b04SRichard Tran Mills #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
467938d9b04SRichard Tran Mills   PetscInt i, j;
468938d9b04SRichard Tran Mills #endif
469938d9b04SRichard Tran Mills 
470938d9b04SRichard Tran Mills   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
471938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm;
472938d9b04SRichard Tran Mills   PetscInt       *iperm; /* Points to the permutation vector. */
473938d9b04SRichard Tran Mills   PetscInt       *xgroup;
474938d9b04SRichard Tran Mills   /* Denotes where groups of rows with same number of nonzeros
475938d9b04SRichard Tran Mills    * begin and end in iperm. */
476938d9b04SRichard Tran Mills   PetscInt       *nzgroup;
477938d9b04SRichard Tran Mills   PetscInt        ngroup;
478938d9b04SRichard Tran Mills   PetscInt        igroup;
479938d9b04SRichard Tran Mills   PetscInt        jstart, jend;
480938d9b04SRichard Tran Mills   /* jstart is used in loops to denote the position in iperm where a
481938d9b04SRichard Tran Mills    * group starts; jend denotes the position where it ends.
482938d9b04SRichard Tran Mills    * (jend + 1 is where the next group starts.) */
483938d9b04SRichard Tran Mills   PetscInt        iold, nz;
484938d9b04SRichard Tran Mills   PetscInt        istart, iend, isize;
485938d9b04SRichard Tran Mills   PetscInt        ipos;
486938d9b04SRichard Tran Mills   PetscScalar     yp[NDIM];
487938d9b04SRichard Tran Mills   PetscInt        ip[NDIM];
488938d9b04SRichard Tran Mills   /* yp[] and ip[] are treated as vector "registers" for performing
489938d9b04SRichard Tran Mills    * the mat-vec. */
490938d9b04SRichard Tran Mills 
491938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
492938d9b04SRichard Tran Mills #pragma disjoint(*x, *y, *aa)
493938d9b04SRichard Tran Mills #endif
494938d9b04SRichard Tran Mills 
495938d9b04SRichard Tran Mills   PetscFunctionBegin;
4969566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
4979566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(yy, ww, &y, &w));
498938d9b04SRichard Tran Mills 
499938d9b04SRichard Tran Mills   aj = a->j; /* aj[k] gives column index for element aa[k]. */
500938d9b04SRichard Tran Mills   aa = a->a; /* Nonzero elements stored row-by-row. */
501938d9b04SRichard Tran Mills   ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
502938d9b04SRichard Tran Mills 
503938d9b04SRichard Tran Mills   /* Get the info we need about the permutations and groupings. */
504938d9b04SRichard Tran Mills   aijperm = (Mat_SeqAIJPERM *)A->spptr;
505938d9b04SRichard Tran Mills   iperm   = aijperm->iperm;
506938d9b04SRichard Tran Mills   ngroup  = aijperm->ngroup;
507938d9b04SRichard Tran Mills   xgroup  = aijperm->xgroup;
508938d9b04SRichard Tran Mills   nzgroup = aijperm->nzgroup;
509938d9b04SRichard Tran Mills 
510938d9b04SRichard Tran Mills #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
511938d9b04SRichard Tran Mills   fortranmultaddaijperm_(&m, x, ii, aj, aa, y, w);
512938d9b04SRichard Tran Mills #else
513938d9b04SRichard Tran Mills 
514938d9b04SRichard Tran Mills   for (igroup = 0; igroup < ngroup; igroup++) {
515938d9b04SRichard Tran Mills     jstart = xgroup[igroup];
516938d9b04SRichard Tran Mills     jend   = xgroup[igroup + 1] - 1;
517938d9b04SRichard Tran Mills 
518938d9b04SRichard Tran Mills     nz = nzgroup[igroup];
519938d9b04SRichard Tran Mills 
520938d9b04SRichard Tran Mills     /* Handle the special cases where the number of nonzeros per row
521938d9b04SRichard Tran Mills      * in the group is either 0 or 1. */
522938d9b04SRichard Tran Mills     if (nz == 0) {
523938d9b04SRichard Tran Mills       for (i = jstart; i <= jend; i++) {
524938d9b04SRichard Tran Mills         iold    = iperm[i];
525938d9b04SRichard Tran Mills         y[iold] = w[iold];
526938d9b04SRichard Tran Mills       }
5279371c9d4SSatish Balay     } else if (nz == 1) {
528938d9b04SRichard Tran Mills       for (i = jstart; i <= jend; i++) {
529938d9b04SRichard Tran Mills         iold    = iperm[i];
530938d9b04SRichard Tran Mills         ipos    = ai[iold];
531938d9b04SRichard Tran Mills         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
532938d9b04SRichard Tran Mills       }
533938d9b04SRichard Tran Mills     }
534938d9b04SRichard Tran Mills     /* For the general case: */
535938d9b04SRichard Tran Mills     else {
536938d9b04SRichard Tran Mills       /* We work our way through the current group in chunks of NDIM rows
537938d9b04SRichard Tran Mills        * at a time. */
538938d9b04SRichard Tran Mills 
539938d9b04SRichard Tran Mills       for (istart = jstart; istart <= jend; istart += NDIM) {
540938d9b04SRichard Tran Mills         /* Figure out where the chunk of 'isize' rows ends in iperm.
541938d9b04SRichard Tran Mills          * 'isize may of course be less than NDIM for the last chunk. */
542938d9b04SRichard Tran Mills         iend = istart + (NDIM - 1);
543938d9b04SRichard Tran Mills         if (iend > jend) iend = jend;
544938d9b04SRichard Tran Mills         isize = iend - istart + 1;
545938d9b04SRichard Tran Mills 
546938d9b04SRichard Tran Mills         /* Initialize the yp[] array that will be used to hold part of
547938d9b04SRichard Tran Mills          * the permuted results vector, and figure out where in aa each
548938d9b04SRichard Tran Mills          * row of the chunk will begin. */
549938d9b04SRichard Tran Mills         for (i = 0; i < isize; i++) {
550938d9b04SRichard Tran Mills           iold  = iperm[istart + i];
551938d9b04SRichard Tran Mills           /* iold is a row number from the matrix A *before* reordering. */
552938d9b04SRichard Tran Mills           ip[i] = ai[iold];
553938d9b04SRichard Tran Mills           /* ip[i] tells us where the ith row of the chunk begins in aa. */
554938d9b04SRichard Tran Mills           yp[i] = w[iold];
555938d9b04SRichard Tran Mills         }
556938d9b04SRichard Tran Mills 
557938d9b04SRichard Tran Mills         /* If the number of zeros per row exceeds the number of rows in
558938d9b04SRichard Tran Mills          * the chunk, we should vectorize along nz, that is, perform the
559938d9b04SRichard Tran Mills          * mat-vec one row at a time as in the usual CSR case. */
560938d9b04SRichard Tran Mills         if (nz > isize) {
561938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
562938d9b04SRichard Tran Mills #pragma _CRI preferstream
563938d9b04SRichard Tran Mills #endif
564938d9b04SRichard Tran Mills           for (i = 0; i < isize; i++) {
565938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
566938d9b04SRichard Tran Mills #pragma _CRI prefervector
567938d9b04SRichard Tran Mills #endif
568938d9b04SRichard Tran Mills             for (j = 0; j < nz; j++) {
569938d9b04SRichard Tran Mills               ipos = ip[i] + j;
570938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
571938d9b04SRichard Tran Mills             }
572938d9b04SRichard Tran Mills           }
573938d9b04SRichard Tran Mills         }
574938d9b04SRichard Tran Mills         /* Otherwise, there are enough rows in the chunk to make it
575938d9b04SRichard Tran Mills          * worthwhile to vectorize across the rows, that is, to do the
576938d9b04SRichard Tran Mills          * matvec by operating with "columns" of the chunk. */
577938d9b04SRichard Tran Mills         else {
578938d9b04SRichard Tran Mills           for (j = 0; j < nz; j++) {
579938d9b04SRichard Tran Mills             for (i = 0; i < isize; i++) {
580938d9b04SRichard Tran Mills               ipos = ip[i] + j;
581938d9b04SRichard Tran Mills               yp[i] += aa[ipos] * x[aj[ipos]];
582938d9b04SRichard Tran Mills             }
583938d9b04SRichard Tran Mills           }
584938d9b04SRichard Tran Mills         }
585938d9b04SRichard Tran Mills 
586938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR)
587938d9b04SRichard Tran Mills #pragma _CRI ivdep
588938d9b04SRichard Tran Mills #endif
589938d9b04SRichard Tran Mills         /* Put results from yp[] into non-permuted result vector y. */
590ad540459SPierre Jolivet         for (i = 0; i < isize; i++) y[iperm[istart + i]] = yp[i];
591938d9b04SRichard Tran Mills       } /* End processing chunk of isize rows of a group. */
592938d9b04SRichard Tran Mills 
593938d9b04SRichard Tran Mills     } /* End handling matvec for chunk with nz > 1. */
594938d9b04SRichard Tran Mills   }   /* End loop over igroup. */
595938d9b04SRichard Tran Mills 
596938d9b04SRichard Tran Mills #endif
5979566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
5989566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5999566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(yy, ww, &y, &w));
600938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
601938d9b04SRichard Tran Mills }
602938d9b04SRichard Tran Mills 
603938d9b04SRichard Tran Mills /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
604938d9b04SRichard Tran Mills  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
605938d9b04SRichard Tran Mills  * routine, but can also be used to convert an assembled SeqAIJ matrix
606938d9b04SRichard Tran Mills  * into a SeqAIJPERM one. */
6079371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A, MatType type, MatReuse reuse, Mat *newmat) {
608938d9b04SRichard Tran Mills   Mat             B = *newmat;
609938d9b04SRichard Tran Mills   Mat_SeqAIJPERM *aijperm;
610938d9b04SRichard Tran Mills   PetscBool       sametype;
611938d9b04SRichard Tran Mills 
612938d9b04SRichard Tran Mills   PetscFunctionBegin;
61348a46eb9SPierre Jolivet   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(A, MAT_COPY_VALUES, &B));
6149566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, type, &sametype));
615938d9b04SRichard Tran Mills   if (sametype) PetscFunctionReturn(0);
616938d9b04SRichard Tran Mills 
617*4dfa11a4SJacob Faibussowitsch   PetscCall(PetscNew(&aijperm));
618938d9b04SRichard Tran Mills   B->spptr = (void *)aijperm;
619938d9b04SRichard Tran Mills 
620938d9b04SRichard Tran Mills   /* Set function pointers for methods that we inherit from AIJ but override. */
621938d9b04SRichard Tran Mills   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
622938d9b04SRichard Tran Mills   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
623938d9b04SRichard Tran Mills   B->ops->destroy     = MatDestroy_SeqAIJPERM;
624938d9b04SRichard Tran Mills   B->ops->mult        = MatMult_SeqAIJPERM;
625938d9b04SRichard Tran Mills   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
626938d9b04SRichard Tran Mills 
627938d9b04SRichard Tran Mills   aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
628938d9b04SRichard Tran Mills   /* If A has already been assembled, compute the permutation. */
6291baa6e33SBarry Smith   if (A->assembled) PetscCall(MatSeqAIJPERM_create_perm(B));
630938d9b04SRichard Tran Mills 
6319566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijperm_seqaij_C", MatConvert_SeqAIJPERM_SeqAIJ));
632938d9b04SRichard Tran Mills 
6339566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJPERM));
634938d9b04SRichard Tran Mills   *newmat = B;
635938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
636938d9b04SRichard Tran Mills }
637938d9b04SRichard Tran Mills 
638938d9b04SRichard Tran Mills /*@C
63911a5261eSBarry Smith    MatCreateSeqAIJPERM - Creates a sparse matrix of type `MATSEQAIJPERM`.
64011a5261eSBarry Smith    This type inherits from `MATSEQAIJ`, but calculates some additional permutation
641938d9b04SRichard Tran Mills    information that is used to allow better vectorization of some
64211a5261eSBarry Smith    operations.  At the cost of increased storage, the `MATSEQAIJ` formatted
643938d9b04SRichard Tran Mills    matrix can be copied to a format in which pieces of the matrix are
644938d9b04SRichard Tran Mills    stored in ELLPACK format, allowing the vectorized matrix multiply
64511a5261eSBarry Smith    routine to use stride-1 memory accesses.  As with the `MATSEQAIJ` type, it is
646938d9b04SRichard Tran Mills    important to preallocate matrix storage in order to get good assembly
647938d9b04SRichard Tran Mills    performance.
648938d9b04SRichard Tran Mills 
649d083f849SBarry Smith    Collective
650938d9b04SRichard Tran Mills 
651938d9b04SRichard Tran Mills    Input Parameters:
65211a5261eSBarry Smith +  comm - MPI communicator, set to `PETSC_COMM_SELF`
653938d9b04SRichard Tran Mills .  m - number of rows
654938d9b04SRichard Tran Mills .  n - number of columns
655938d9b04SRichard Tran Mills .  nz - number of nonzeros per row (same for all rows)
656938d9b04SRichard Tran Mills -  nnz - array containing the number of nonzeros in the various rows
657938d9b04SRichard Tran Mills          (possibly different for each row) or NULL
658938d9b04SRichard Tran Mills 
659938d9b04SRichard Tran Mills    Output Parameter:
660938d9b04SRichard Tran Mills .  A - the matrix
661938d9b04SRichard Tran Mills 
66211a5261eSBarry Smith    Note:
663938d9b04SRichard Tran Mills    If nnz is given then nz is ignored
664938d9b04SRichard Tran Mills 
665938d9b04SRichard Tran Mills    Level: intermediate
666938d9b04SRichard Tran Mills 
667db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateMPIAIJPERM()`, `MatSetValues()`
668938d9b04SRichard Tran Mills @*/
6699371c9d4SSatish Balay PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
670938d9b04SRichard Tran Mills   PetscFunctionBegin;
6719566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
6729566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
6739566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJPERM));
6749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, nnz));
675938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
676938d9b04SRichard Tran Mills }
677938d9b04SRichard Tran Mills 
6789371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) {
679938d9b04SRichard Tran Mills   PetscFunctionBegin;
6809566063dSJacob Faibussowitsch   PetscCall(MatSetType(A, MATSEQAIJ));
6819566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJPERM(A, MATSEQAIJPERM, MAT_INPLACE_MATRIX, &A));
682938d9b04SRichard Tran Mills   PetscFunctionReturn(0);
683938d9b04SRichard Tran Mills }
684