1938d9b04SRichard Tran Mills 2938d9b04SRichard Tran Mills /* 3938d9b04SRichard Tran Mills Defines basic operations for the MATSEQAIJPERM matrix class. 4938d9b04SRichard Tran Mills This class is derived from the MATSEQAIJ class and retains the 5938d9b04SRichard Tran Mills compressed row storage (aka Yale sparse matrix format) but augments 6938d9b04SRichard Tran Mills it with some permutation information that enables some operations 7938d9b04SRichard Tran Mills to be more vectorizable. A physically rearranged copy of the matrix 8938d9b04SRichard Tran Mills may be stored if the user desires. 9938d9b04SRichard Tran Mills 10938d9b04SRichard Tran Mills Eventually a variety of permutations may be supported. 11938d9b04SRichard Tran Mills */ 12938d9b04SRichard Tran Mills 13938d9b04SRichard Tran Mills #include <../src/mat/impls/aij/seq/aij.h> 14938d9b04SRichard Tran Mills 15*f67b6f2eSRichard Tran Mills #if defined(PETSC_HAVE_IMMINTRIN_H) 16*f67b6f2eSRichard Tran Mills #include <immintrin.h> 17*f67b6f2eSRichard Tran Mills 18*f67b6f2eSRichard Tran Mills #if !defined(_MM_SCALE_8) 19*f67b6f2eSRichard Tran Mills #define _MM_SCALE_8 8 20*f67b6f2eSRichard Tran Mills #endif 21*f67b6f2eSRichard Tran Mills #if !defined(_MM_SCALE_4) 22*f67b6f2eSRichard Tran Mills #define _MM_SCALE_4 4 23*f67b6f2eSRichard Tran Mills #endif 24*f67b6f2eSRichard Tran Mills #endif 25*f67b6f2eSRichard Tran Mills 26938d9b04SRichard Tran Mills #define NDIM 512 27938d9b04SRichard Tran Mills /* NDIM specifies how many rows at a time we should work with when 28938d9b04SRichard Tran Mills * performing the vectorized mat-vec. This depends on various factors 29938d9b04SRichard Tran Mills * such as vector register length, etc., and I really need to add a 30938d9b04SRichard Tran Mills * way for the user (or the library) to tune this. I'm setting it to 31938d9b04SRichard Tran Mills * 512 for now since that is what Ed D'Azevedo was using in his Fortran 32938d9b04SRichard Tran Mills * routines. */ 33938d9b04SRichard Tran Mills 34938d9b04SRichard Tran Mills typedef struct { 35938d9b04SRichard Tran Mills PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */ 36938d9b04SRichard Tran Mills 37938d9b04SRichard Tran Mills PetscInt ngroup; 38938d9b04SRichard Tran Mills PetscInt *xgroup; 39938d9b04SRichard Tran Mills /* Denotes where groups of rows with same number of nonzeros 40938d9b04SRichard Tran Mills * begin and end, i.e., xgroup[i] gives us the position in iperm[] 41938d9b04SRichard Tran Mills * where the ith group begins. */ 42938d9b04SRichard Tran Mills 43938d9b04SRichard Tran Mills PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */ 44938d9b04SRichard Tran Mills PetscInt *iperm; /* The permutation vector. */ 45938d9b04SRichard Tran Mills 46938d9b04SRichard Tran Mills /* Some of this stuff is for Ed's recursive triangular solve. 47938d9b04SRichard Tran Mills * I'm not sure what I need yet. */ 48938d9b04SRichard Tran Mills PetscInt blocksize; 49938d9b04SRichard Tran Mills PetscInt nstep; 50938d9b04SRichard Tran Mills PetscInt *jstart_list; 51938d9b04SRichard Tran Mills PetscInt *jend_list; 52938d9b04SRichard Tran Mills PetscInt *action_list; 53938d9b04SRichard Tran Mills PetscInt *ngroup_list; 54938d9b04SRichard Tran Mills PetscInt **ipointer_list; 55938d9b04SRichard Tran Mills PetscInt **xgroup_list; 56938d9b04SRichard Tran Mills PetscInt **nzgroup_list; 57938d9b04SRichard Tran Mills PetscInt **iperm_list; 58938d9b04SRichard Tran Mills } Mat_SeqAIJPERM; 59938d9b04SRichard Tran Mills 60938d9b04SRichard Tran Mills PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat) 61938d9b04SRichard Tran Mills { 62938d9b04SRichard Tran Mills /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */ 63938d9b04SRichard Tran Mills /* so we will ignore 'MatType type'. */ 64938d9b04SRichard Tran Mills PetscErrorCode ierr; 65938d9b04SRichard Tran Mills Mat B = *newmat; 66938d9b04SRichard Tran Mills Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr; 67938d9b04SRichard Tran Mills 68938d9b04SRichard Tran Mills PetscFunctionBegin; 69938d9b04SRichard Tran Mills if (reuse == MAT_INITIAL_MATRIX) { 70938d9b04SRichard Tran Mills ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 71938d9b04SRichard Tran Mills aijperm=(Mat_SeqAIJPERM*)B->spptr; 72938d9b04SRichard Tran Mills } 73938d9b04SRichard Tran Mills 74938d9b04SRichard Tran Mills /* Reset the original function pointers. */ 75938d9b04SRichard Tran Mills B->ops->assemblyend = MatAssemblyEnd_SeqAIJ; 76938d9b04SRichard Tran Mills B->ops->destroy = MatDestroy_SeqAIJ; 77938d9b04SRichard Tran Mills B->ops->duplicate = MatDuplicate_SeqAIJ; 78938d9b04SRichard Tran Mills B->ops->mult = MatMult_SeqAIJ; 79938d9b04SRichard Tran Mills B->ops->multadd = MatMultAdd_SeqAIJ; 80938d9b04SRichard Tran Mills 81938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr); 82938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 83938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 84938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",NULL);CHKERRQ(ierr); 85938d9b04SRichard Tran Mills 86938d9b04SRichard Tran Mills /* Free everything in the Mat_SeqAIJPERM data structure.*/ 87938d9b04SRichard Tran Mills ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 88938d9b04SRichard Tran Mills ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 89938d9b04SRichard Tran Mills ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 90938d9b04SRichard Tran Mills ierr = PetscFree(B->spptr);CHKERRQ(ierr); 91938d9b04SRichard Tran Mills 92938d9b04SRichard Tran Mills /* Change the type of B to MATSEQAIJ. */ 93938d9b04SRichard Tran Mills ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr); 94938d9b04SRichard Tran Mills 95938d9b04SRichard Tran Mills *newmat = B; 96938d9b04SRichard Tran Mills PetscFunctionReturn(0); 97938d9b04SRichard Tran Mills } 98938d9b04SRichard Tran Mills 99938d9b04SRichard Tran Mills PetscErrorCode MatDestroy_SeqAIJPERM(Mat A) 100938d9b04SRichard Tran Mills { 101938d9b04SRichard Tran Mills PetscErrorCode ierr; 102938d9b04SRichard Tran Mills Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 103938d9b04SRichard Tran Mills 104938d9b04SRichard Tran Mills PetscFunctionBegin; 105938d9b04SRichard Tran Mills if (aijperm) { 106938d9b04SRichard Tran Mills /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */ 107938d9b04SRichard Tran Mills ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 108938d9b04SRichard Tran Mills ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 109938d9b04SRichard Tran Mills ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 110938d9b04SRichard Tran Mills ierr = PetscFree(A->spptr);CHKERRQ(ierr); 111938d9b04SRichard Tran Mills } 112938d9b04SRichard Tran Mills /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 113938d9b04SRichard Tran Mills * to destroy everything that remains. */ 114938d9b04SRichard Tran Mills ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr); 115938d9b04SRichard Tran Mills /* Note that I don't call MatSetType(). I believe this is because that 116938d9b04SRichard Tran Mills * is only to be called when *building* a matrix. I could be wrong, but 117938d9b04SRichard Tran Mills * that is how things work for the SuperLU matrix class. */ 118938d9b04SRichard Tran Mills ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 119938d9b04SRichard Tran Mills PetscFunctionReturn(0); 120938d9b04SRichard Tran Mills } 121938d9b04SRichard Tran Mills 122938d9b04SRichard Tran Mills PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M) 123938d9b04SRichard Tran Mills { 124938d9b04SRichard Tran Mills PetscErrorCode ierr; 125938d9b04SRichard Tran Mills Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 126938d9b04SRichard Tran Mills Mat_SeqAIJPERM *aijperm_dest; 127938d9b04SRichard Tran Mills PetscBool perm; 128938d9b04SRichard Tran Mills 129938d9b04SRichard Tran Mills PetscFunctionBegin; 130938d9b04SRichard Tran Mills ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr); 131938d9b04SRichard Tran Mills ierr = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr); 132938d9b04SRichard Tran Mills if (perm) { 133938d9b04SRichard Tran Mills aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr; 134938d9b04SRichard Tran Mills ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr); 135938d9b04SRichard Tran Mills ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr); 136938d9b04SRichard Tran Mills ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr); 137938d9b04SRichard Tran Mills } else { 138938d9b04SRichard Tran Mills ierr = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr); 139938d9b04SRichard Tran Mills (*M)->spptr = (void*) aijperm_dest; 140938d9b04SRichard Tran Mills ierr = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr); 141938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 142938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr); 143938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr); 144938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)*M,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr); 145938d9b04SRichard Tran Mills } 146938d9b04SRichard Tran Mills ierr = PetscMemcpy(aijperm_dest,aijperm,sizeof(Mat_SeqAIJPERM));CHKERRQ(ierr); 147938d9b04SRichard Tran Mills /* Allocate space for, and copy the grouping and permutation info. 148938d9b04SRichard Tran Mills * I note that when the groups are initially determined in 149938d9b04SRichard Tran Mills * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than 150938d9b04SRichard Tran Mills * necessary. But at this point, we know how large they need to be, and 151938d9b04SRichard Tran Mills * allocate only the necessary amount of memory. So the duplicated matrix 152938d9b04SRichard Tran Mills * may actually use slightly less storage than the original! */ 153938d9b04SRichard Tran Mills ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr); 154938d9b04SRichard Tran Mills ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr); 155938d9b04SRichard Tran Mills ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr); 156938d9b04SRichard Tran Mills ierr = PetscMemcpy(aijperm_dest->iperm,aijperm->iperm,sizeof(PetscInt)*A->rmap->n);CHKERRQ(ierr); 157938d9b04SRichard Tran Mills ierr = PetscMemcpy(aijperm_dest->xgroup,aijperm->xgroup,sizeof(PetscInt)*(aijperm->ngroup+1));CHKERRQ(ierr); 158938d9b04SRichard Tran Mills ierr = PetscMemcpy(aijperm_dest->nzgroup,aijperm->nzgroup,sizeof(PetscInt)*aijperm->ngroup);CHKERRQ(ierr); 159938d9b04SRichard Tran Mills PetscFunctionReturn(0); 160938d9b04SRichard Tran Mills } 161938d9b04SRichard Tran Mills 162938d9b04SRichard Tran Mills PetscErrorCode MatSeqAIJPERM_create_perm(Mat A) 163938d9b04SRichard Tran Mills { 164938d9b04SRichard Tran Mills PetscErrorCode ierr; 165938d9b04SRichard Tran Mills Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data; 166938d9b04SRichard Tran Mills Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 167938d9b04SRichard Tran Mills PetscInt m; /* Number of rows in the matrix. */ 168938d9b04SRichard Tran Mills PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */ 169938d9b04SRichard Tran Mills PetscInt maxnz; /* Maximum number of nonzeros in any row. */ 170938d9b04SRichard Tran Mills PetscInt *rows_in_bucket; 171938d9b04SRichard Tran Mills /* To construct the permutation, we sort each row into one of maxnz 172938d9b04SRichard Tran Mills * buckets based on how many nonzeros are in the row. */ 173938d9b04SRichard Tran Mills PetscInt nz; 174938d9b04SRichard Tran Mills PetscInt *nz_in_row; /* the number of nonzero elements in row k. */ 175938d9b04SRichard Tran Mills PetscInt *ipnz; 176938d9b04SRichard Tran Mills /* When constructing the iperm permutation vector, 177938d9b04SRichard Tran Mills * ipnz[nz] is used to point to the next place in the permutation vector 178938d9b04SRichard Tran Mills * that a row with nz nonzero elements should be placed.*/ 179938d9b04SRichard Tran Mills PetscInt i, ngroup, istart, ipos; 180938d9b04SRichard Tran Mills 181938d9b04SRichard Tran Mills PetscFunctionBegin; 182938d9b04SRichard Tran Mills if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */ 183938d9b04SRichard Tran Mills aijperm->nonzerostate = A->nonzerostate; 184938d9b04SRichard Tran Mills /* Free anything previously put in the Mat_SeqAIJPERM data structure. */ 185938d9b04SRichard Tran Mills ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr); 186938d9b04SRichard Tran Mills ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr); 187938d9b04SRichard Tran Mills ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr); 188938d9b04SRichard Tran Mills 189938d9b04SRichard Tran Mills m = A->rmap->n; 190938d9b04SRichard Tran Mills ia = a->i; 191938d9b04SRichard Tran Mills 192938d9b04SRichard Tran Mills /* Allocate the arrays that will hold the permutation vector. */ 193938d9b04SRichard Tran Mills ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr); 194938d9b04SRichard Tran Mills 195938d9b04SRichard Tran Mills /* Allocate some temporary work arrays that will be used in 196938d9b04SRichard Tran Mills * calculating the permuation vector and groupings. */ 197938d9b04SRichard Tran Mills ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr); 198938d9b04SRichard Tran Mills 199938d9b04SRichard Tran Mills /* Now actually figure out the permutation and grouping. */ 200938d9b04SRichard Tran Mills 201938d9b04SRichard Tran Mills /* First pass: Determine number of nonzeros in each row, maximum 202938d9b04SRichard Tran Mills * number of nonzeros in any row, and how many rows fall into each 203938d9b04SRichard Tran Mills * "bucket" of rows with same number of nonzeros. */ 204938d9b04SRichard Tran Mills maxnz = 0; 205938d9b04SRichard Tran Mills for (i=0; i<m; i++) { 206938d9b04SRichard Tran Mills nz_in_row[i] = ia[i+1]-ia[i]; 207938d9b04SRichard Tran Mills if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i]; 208938d9b04SRichard Tran Mills } 209938d9b04SRichard Tran Mills ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr); 210938d9b04SRichard Tran Mills ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr); 211938d9b04SRichard Tran Mills 212938d9b04SRichard Tran Mills for (i=0; i<=maxnz; i++) { 213938d9b04SRichard Tran Mills rows_in_bucket[i] = 0; 214938d9b04SRichard Tran Mills } 215938d9b04SRichard Tran Mills for (i=0; i<m; i++) { 216938d9b04SRichard Tran Mills nz = nz_in_row[i]; 217938d9b04SRichard Tran Mills rows_in_bucket[nz]++; 218938d9b04SRichard Tran Mills } 219938d9b04SRichard Tran Mills 220938d9b04SRichard Tran Mills /* Allocate space for the grouping info. There will be at most (maxnz + 1) 221938d9b04SRichard Tran Mills * groups. (It is maxnz + 1 instead of simply maxnz because there may be 222938d9b04SRichard Tran Mills * rows with no nonzero elements.) If there are (maxnz + 1) groups, 223938d9b04SRichard Tran Mills * then xgroup[] must consist of (maxnz + 2) elements, since the last 224938d9b04SRichard Tran Mills * element of xgroup will tell us where the (maxnz + 1)th group ends. 225938d9b04SRichard Tran Mills * We allocate space for the maximum number of groups; 226938d9b04SRichard Tran Mills * that is potentially a little wasteful, but not too much so. 227938d9b04SRichard Tran Mills * Perhaps I should fix it later. */ 228938d9b04SRichard Tran Mills ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr); 229938d9b04SRichard Tran Mills ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr); 230938d9b04SRichard Tran Mills 231938d9b04SRichard Tran Mills /* Second pass. Look at what is in the buckets and create the groupings. 232938d9b04SRichard Tran Mills * Note that it is OK to have a group of rows with no non-zero values. */ 233938d9b04SRichard Tran Mills ngroup = 0; 234938d9b04SRichard Tran Mills istart = 0; 235938d9b04SRichard Tran Mills for (i=0; i<=maxnz; i++) { 236938d9b04SRichard Tran Mills if (rows_in_bucket[i] > 0) { 237938d9b04SRichard Tran Mills aijperm->nzgroup[ngroup] = i; 238938d9b04SRichard Tran Mills aijperm->xgroup[ngroup] = istart; 239938d9b04SRichard Tran Mills ngroup++; 240938d9b04SRichard Tran Mills istart += rows_in_bucket[i]; 241938d9b04SRichard Tran Mills } 242938d9b04SRichard Tran Mills } 243938d9b04SRichard Tran Mills 244938d9b04SRichard Tran Mills aijperm->xgroup[ngroup] = istart; 245938d9b04SRichard Tran Mills aijperm->ngroup = ngroup; 246938d9b04SRichard Tran Mills 247938d9b04SRichard Tran Mills /* Now fill in the permutation vector iperm. */ 248938d9b04SRichard Tran Mills ipnz[0] = 0; 249938d9b04SRichard Tran Mills for (i=0; i<maxnz; i++) { 250938d9b04SRichard Tran Mills ipnz[i+1] = ipnz[i] + rows_in_bucket[i]; 251938d9b04SRichard Tran Mills } 252938d9b04SRichard Tran Mills 253938d9b04SRichard Tran Mills for (i=0; i<m; i++) { 254938d9b04SRichard Tran Mills nz = nz_in_row[i]; 255938d9b04SRichard Tran Mills ipos = ipnz[nz]; 256938d9b04SRichard Tran Mills aijperm->iperm[ipos] = i; 257938d9b04SRichard Tran Mills ipnz[nz]++; 258938d9b04SRichard Tran Mills } 259938d9b04SRichard Tran Mills 260938d9b04SRichard Tran Mills /* Clean up temporary work arrays. */ 261938d9b04SRichard Tran Mills ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr); 262938d9b04SRichard Tran Mills ierr = PetscFree(ipnz);CHKERRQ(ierr); 263938d9b04SRichard Tran Mills ierr = PetscFree(nz_in_row);CHKERRQ(ierr); 264938d9b04SRichard Tran Mills PetscFunctionReturn(0); 265938d9b04SRichard Tran Mills } 266938d9b04SRichard Tran Mills 267938d9b04SRichard Tran Mills 268938d9b04SRichard Tran Mills PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode) 269938d9b04SRichard Tran Mills { 270938d9b04SRichard Tran Mills PetscErrorCode ierr; 271938d9b04SRichard Tran Mills Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 272938d9b04SRichard Tran Mills 273938d9b04SRichard Tran Mills PetscFunctionBegin; 274938d9b04SRichard Tran Mills if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 275938d9b04SRichard Tran Mills 276938d9b04SRichard Tran Mills /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some 277938d9b04SRichard Tran Mills * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 278938d9b04SRichard Tran Mills * I'm not sure if this is the best way to do this, but it avoids 279938d9b04SRichard Tran Mills * a lot of code duplication. 280938d9b04SRichard Tran Mills * I also note that currently MATSEQAIJPERM doesn't know anything about 281938d9b04SRichard Tran Mills * the Mat_CompressedRow data structure that SeqAIJ now uses when there 282938d9b04SRichard Tran Mills * are many zero rows. If the SeqAIJ assembly end routine decides to use 283938d9b04SRichard Tran Mills * this, this may break things. (Don't know... haven't looked at it.) */ 284938d9b04SRichard Tran Mills a->inode.use = PETSC_FALSE; 285938d9b04SRichard Tran Mills ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr); 286938d9b04SRichard Tran Mills 287938d9b04SRichard Tran Mills /* Now calculate the permutation and grouping information. */ 288938d9b04SRichard Tran Mills ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr); 289938d9b04SRichard Tran Mills PetscFunctionReturn(0); 290938d9b04SRichard Tran Mills } 291938d9b04SRichard Tran Mills 292938d9b04SRichard Tran Mills PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy) 293938d9b04SRichard Tran Mills { 294938d9b04SRichard Tran Mills Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 295938d9b04SRichard Tran Mills const PetscScalar *x; 296938d9b04SRichard Tran Mills PetscScalar *y; 297938d9b04SRichard Tran Mills const MatScalar *aa; 298938d9b04SRichard Tran Mills PetscErrorCode ierr; 299938d9b04SRichard Tran Mills const PetscInt *aj,*ai; 300938d9b04SRichard Tran Mills #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)) 301938d9b04SRichard Tran Mills PetscInt i,j; 302938d9b04SRichard Tran Mills #endif 303*f67b6f2eSRichard Tran Mills #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 304*f67b6f2eSRichard Tran Mills __m512d vec_x,vec_y,vec_vals; 305*f67b6f2eSRichard Tran Mills __m256i vec_idx,vec_ipos,vec_j; 306*f67b6f2eSRichard Tran Mills __mmask8 mask; 307*f67b6f2eSRichard Tran Mills #endif 308938d9b04SRichard Tran Mills 309938d9b04SRichard Tran Mills /* Variables that don't appear in MatMult_SeqAIJ. */ 310938d9b04SRichard Tran Mills Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr; 311938d9b04SRichard Tran Mills PetscInt *iperm; /* Points to the permutation vector. */ 312938d9b04SRichard Tran Mills PetscInt *xgroup; 313938d9b04SRichard Tran Mills /* Denotes where groups of rows with same number of nonzeros 314938d9b04SRichard Tran Mills * begin and end in iperm. */ 315938d9b04SRichard Tran Mills PetscInt *nzgroup; 316938d9b04SRichard Tran Mills PetscInt ngroup; 317938d9b04SRichard Tran Mills PetscInt igroup; 318938d9b04SRichard Tran Mills PetscInt jstart,jend; 319938d9b04SRichard Tran Mills /* jstart is used in loops to denote the position in iperm where a 320938d9b04SRichard Tran Mills * group starts; jend denotes the position where it ends. 321938d9b04SRichard Tran Mills * (jend + 1 is where the next group starts.) */ 322938d9b04SRichard Tran Mills PetscInt iold,nz; 323938d9b04SRichard Tran Mills PetscInt istart,iend,isize; 324938d9b04SRichard Tran Mills PetscInt ipos; 325938d9b04SRichard Tran Mills PetscScalar yp[NDIM]; 326938d9b04SRichard Tran Mills PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */ 327938d9b04SRichard Tran Mills 328938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 329938d9b04SRichard Tran Mills #pragma disjoint(*x,*y,*aa) 330938d9b04SRichard Tran Mills #endif 331938d9b04SRichard Tran Mills 332938d9b04SRichard Tran Mills PetscFunctionBegin; 333938d9b04SRichard Tran Mills ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 334938d9b04SRichard Tran Mills ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 335938d9b04SRichard Tran Mills aj = a->j; /* aj[k] gives column index for element aa[k]. */ 336938d9b04SRichard Tran Mills aa = a->a; /* Nonzero elements stored row-by-row. */ 337938d9b04SRichard Tran Mills ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 338938d9b04SRichard Tran Mills 339938d9b04SRichard Tran Mills /* Get the info we need about the permutations and groupings. */ 340938d9b04SRichard Tran Mills iperm = aijperm->iperm; 341938d9b04SRichard Tran Mills ngroup = aijperm->ngroup; 342938d9b04SRichard Tran Mills xgroup = aijperm->xgroup; 343938d9b04SRichard Tran Mills nzgroup = aijperm->nzgroup; 344938d9b04SRichard Tran Mills 345938d9b04SRichard Tran Mills #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking) 346938d9b04SRichard Tran Mills fortranmultaijperm_(&m,x,ii,aj,aa,y); 347938d9b04SRichard Tran Mills #else 348938d9b04SRichard Tran Mills 349938d9b04SRichard Tran Mills for (igroup=0; igroup<ngroup; igroup++) { 350938d9b04SRichard Tran Mills jstart = xgroup[igroup]; 351938d9b04SRichard Tran Mills jend = xgroup[igroup+1] - 1; 352938d9b04SRichard Tran Mills nz = nzgroup[igroup]; 353938d9b04SRichard Tran Mills 354938d9b04SRichard Tran Mills /* Handle the special cases where the number of nonzeros per row 355938d9b04SRichard Tran Mills * in the group is either 0 or 1. */ 356938d9b04SRichard Tran Mills if (nz == 0) { 357938d9b04SRichard Tran Mills for (i=jstart; i<=jend; i++) { 358938d9b04SRichard Tran Mills y[iperm[i]] = 0.0; 359938d9b04SRichard Tran Mills } 360938d9b04SRichard Tran Mills } else if (nz == 1) { 361938d9b04SRichard Tran Mills for (i=jstart; i<=jend; i++) { 362938d9b04SRichard Tran Mills iold = iperm[i]; 363938d9b04SRichard Tran Mills ipos = ai[iold]; 364938d9b04SRichard Tran Mills y[iold] = aa[ipos] * x[aj[ipos]]; 365938d9b04SRichard Tran Mills } 366938d9b04SRichard Tran Mills } else { 367938d9b04SRichard Tran Mills 368938d9b04SRichard Tran Mills /* We work our way through the current group in chunks of NDIM rows 369938d9b04SRichard Tran Mills * at a time. */ 370938d9b04SRichard Tran Mills 371938d9b04SRichard Tran Mills for (istart=jstart; istart<=jend; istart+=NDIM) { 372938d9b04SRichard Tran Mills /* Figure out where the chunk of 'isize' rows ends in iperm. 373938d9b04SRichard Tran Mills * 'isize may of course be less than NDIM for the last chunk. */ 374938d9b04SRichard Tran Mills iend = istart + (NDIM - 1); 375938d9b04SRichard Tran Mills 376938d9b04SRichard Tran Mills if (iend > jend) iend = jend; 377938d9b04SRichard Tran Mills 378938d9b04SRichard Tran Mills isize = iend - istart + 1; 379938d9b04SRichard Tran Mills 380938d9b04SRichard Tran Mills /* Initialize the yp[] array that will be used to hold part of 381938d9b04SRichard Tran Mills * the permuted results vector, and figure out where in aa each 382938d9b04SRichard Tran Mills * row of the chunk will begin. */ 383938d9b04SRichard Tran Mills for (i=0; i<isize; i++) { 384938d9b04SRichard Tran Mills iold = iperm[istart + i]; 385938d9b04SRichard Tran Mills /* iold is a row number from the matrix A *before* reordering. */ 386938d9b04SRichard Tran Mills ip[i] = ai[iold]; 387938d9b04SRichard Tran Mills /* ip[i] tells us where the ith row of the chunk begins in aa. */ 388938d9b04SRichard Tran Mills yp[i] = (PetscScalar) 0.0; 389938d9b04SRichard Tran Mills } 390938d9b04SRichard Tran Mills 391938d9b04SRichard Tran Mills /* If the number of zeros per row exceeds the number of rows in 392938d9b04SRichard Tran Mills * the chunk, we should vectorize along nz, that is, perform the 393938d9b04SRichard Tran Mills * mat-vec one row at a time as in the usual CSR case. */ 394938d9b04SRichard Tran Mills if (nz > isize) { 395938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR) 396938d9b04SRichard Tran Mills #pragma _CRI preferstream 397938d9b04SRichard Tran Mills #endif 398938d9b04SRichard Tran Mills for (i=0; i<isize; i++) { 399938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR) 400938d9b04SRichard Tran Mills #pragma _CRI prefervector 401938d9b04SRichard Tran Mills #endif 402*f67b6f2eSRichard Tran Mills 403*f67b6f2eSRichard Tran Mills #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 404*f67b6f2eSRichard Tran Mills vec_y = _mm512_setzero_pd(); 405*f67b6f2eSRichard Tran Mills ipos = ip[i]; 406*f67b6f2eSRichard Tran Mills for (j=0; j<(nz>>3); j++) { 407*f67b6f2eSRichard Tran Mills vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 408*f67b6f2eSRichard Tran Mills vec_vals = _mm512_loadu_pd(&aa[ipos]); 409*f67b6f2eSRichard Tran Mills vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 410*f67b6f2eSRichard Tran Mills vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 411*f67b6f2eSRichard Tran Mills ipos += 8; 412*f67b6f2eSRichard Tran Mills } 413*f67b6f2eSRichard Tran Mills if ((nz&0x07)>2) { 414*f67b6f2eSRichard Tran Mills mask = (__mmask8)(0xff >> (8-(nz&0x07))); 415*f67b6f2eSRichard Tran Mills vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]); 416*f67b6f2eSRichard Tran Mills vec_vals = _mm512_loadu_pd(&aa[ipos]); 417*f67b6f2eSRichard Tran Mills vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8); 418*f67b6f2eSRichard Tran Mills vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask); 419*f67b6f2eSRichard Tran Mills } else if ((nz&0x07)==2) { 420*f67b6f2eSRichard Tran Mills yp[i] += aa[ipos]*x[aj[ipos]]; 421*f67b6f2eSRichard Tran Mills yp[i] += aa[ipos+1]*x[aj[ipos+1]]; 422*f67b6f2eSRichard Tran Mills } else if ((nz&0x07)==1) { 423*f67b6f2eSRichard Tran Mills yp[i] += aa[ipos]*x[aj[ipos]]; 424*f67b6f2eSRichard Tran Mills } 425*f67b6f2eSRichard Tran Mills yp[i] += _mm512_reduce_add_pd(vec_y); 426*f67b6f2eSRichard Tran Mills #else 427938d9b04SRichard Tran Mills for (j=0; j<nz; j++) { 428938d9b04SRichard Tran Mills ipos = ip[i] + j; 429938d9b04SRichard Tran Mills yp[i] += aa[ipos] * x[aj[ipos]]; 430938d9b04SRichard Tran Mills } 431*f67b6f2eSRichard Tran Mills #endif 432938d9b04SRichard Tran Mills } 433938d9b04SRichard Tran Mills } else { 434938d9b04SRichard Tran Mills /* Otherwise, there are enough rows in the chunk to make it 435938d9b04SRichard Tran Mills * worthwhile to vectorize across the rows, that is, to do the 436938d9b04SRichard Tran Mills * matvec by operating with "columns" of the chunk. */ 437938d9b04SRichard Tran Mills for (j=0; j<nz; j++) { 438*f67b6f2eSRichard Tran Mills #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 439*f67b6f2eSRichard Tran Mills vec_j = _mm256_set1_epi32(j); 440*f67b6f2eSRichard Tran Mills for (i=0; i<((isize>>3)<<3); i+=8) { 441*f67b6f2eSRichard Tran Mills vec_y = _mm512_loadu_pd(&yp[i]); 442*f67b6f2eSRichard Tran Mills vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]); 443*f67b6f2eSRichard Tran Mills vec_ipos = _mm256_add_epi32(vec_ipos,vec_j); 444*f67b6f2eSRichard Tran Mills vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4); 445*f67b6f2eSRichard Tran Mills vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8); 446*f67b6f2eSRichard Tran Mills vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8); 447*f67b6f2eSRichard Tran Mills vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y); 448*f67b6f2eSRichard Tran Mills _mm512_storeu_pd(&yp[i],vec_y); 449*f67b6f2eSRichard Tran Mills } 450*f67b6f2eSRichard Tran Mills for (i=isize-(isize&0x07); i<isize; i++) { 451*f67b6f2eSRichard Tran Mills ipos = ip[i]+j; 452*f67b6f2eSRichard Tran Mills yp[i] += aa[ipos]*x[aj[ipos]]; 453*f67b6f2eSRichard Tran Mills } 454*f67b6f2eSRichard Tran Mills #else 455938d9b04SRichard Tran Mills for (i=0; i<isize; i++) { 456938d9b04SRichard Tran Mills ipos = ip[i] + j; 457938d9b04SRichard Tran Mills yp[i] += aa[ipos] * x[aj[ipos]]; 458938d9b04SRichard Tran Mills } 459*f67b6f2eSRichard Tran Mills #endif 460938d9b04SRichard Tran Mills } 461938d9b04SRichard Tran Mills } 462938d9b04SRichard Tran Mills 463938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR) 464938d9b04SRichard Tran Mills #pragma _CRI ivdep 465938d9b04SRichard Tran Mills #endif 466938d9b04SRichard Tran Mills /* Put results from yp[] into non-permuted result vector y. */ 467938d9b04SRichard Tran Mills for (i=0; i<isize; i++) { 468938d9b04SRichard Tran Mills y[iperm[istart+i]] = yp[i]; 469938d9b04SRichard Tran Mills } 470938d9b04SRichard Tran Mills } /* End processing chunk of isize rows of a group. */ 471938d9b04SRichard Tran Mills } /* End handling matvec for chunk with nz > 1. */ 472938d9b04SRichard Tran Mills } /* End loop over igroup. */ 473938d9b04SRichard Tran Mills #endif 474938d9b04SRichard Tran Mills ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr); 475938d9b04SRichard Tran Mills ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 476938d9b04SRichard Tran Mills ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 477938d9b04SRichard Tran Mills PetscFunctionReturn(0); 478938d9b04SRichard Tran Mills } 479938d9b04SRichard Tran Mills 480938d9b04SRichard Tran Mills 481938d9b04SRichard Tran Mills /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx. 482938d9b04SRichard Tran Mills * Note that the names I used to designate the vectors differs from that 483938d9b04SRichard Tran Mills * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent 484938d9b04SRichard Tran Mills * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */ 485938d9b04SRichard Tran Mills /* 486938d9b04SRichard Tran Mills I hate having virtually identical code for the mult and the multadd!!! 487938d9b04SRichard Tran Mills */ 488938d9b04SRichard Tran Mills PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy) 489938d9b04SRichard Tran Mills { 490938d9b04SRichard Tran Mills Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 491938d9b04SRichard Tran Mills const PetscScalar *x; 492938d9b04SRichard Tran Mills PetscScalar *y,*w; 493938d9b04SRichard Tran Mills const MatScalar *aa; 494938d9b04SRichard Tran Mills PetscErrorCode ierr; 495938d9b04SRichard Tran Mills const PetscInt *aj,*ai; 496938d9b04SRichard Tran Mills #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 497938d9b04SRichard Tran Mills PetscInt i,j; 498938d9b04SRichard Tran Mills #endif 499938d9b04SRichard Tran Mills 500938d9b04SRichard Tran Mills /* Variables that don't appear in MatMultAdd_SeqAIJ. */ 501938d9b04SRichard Tran Mills Mat_SeqAIJPERM * aijperm; 502938d9b04SRichard Tran Mills PetscInt *iperm; /* Points to the permutation vector. */ 503938d9b04SRichard Tran Mills PetscInt *xgroup; 504938d9b04SRichard Tran Mills /* Denotes where groups of rows with same number of nonzeros 505938d9b04SRichard Tran Mills * begin and end in iperm. */ 506938d9b04SRichard Tran Mills PetscInt *nzgroup; 507938d9b04SRichard Tran Mills PetscInt ngroup; 508938d9b04SRichard Tran Mills PetscInt igroup; 509938d9b04SRichard Tran Mills PetscInt jstart,jend; 510938d9b04SRichard Tran Mills /* jstart is used in loops to denote the position in iperm where a 511938d9b04SRichard Tran Mills * group starts; jend denotes the position where it ends. 512938d9b04SRichard Tran Mills * (jend + 1 is where the next group starts.) */ 513938d9b04SRichard Tran Mills PetscInt iold,nz; 514938d9b04SRichard Tran Mills PetscInt istart,iend,isize; 515938d9b04SRichard Tran Mills PetscInt ipos; 516938d9b04SRichard Tran Mills PetscScalar yp[NDIM]; 517938d9b04SRichard Tran Mills PetscInt ip[NDIM]; 518938d9b04SRichard Tran Mills /* yp[] and ip[] are treated as vector "registers" for performing 519938d9b04SRichard Tran Mills * the mat-vec. */ 520938d9b04SRichard Tran Mills 521938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 522938d9b04SRichard Tran Mills #pragma disjoint(*x,*y,*aa) 523938d9b04SRichard Tran Mills #endif 524938d9b04SRichard Tran Mills 525938d9b04SRichard Tran Mills PetscFunctionBegin; 526938d9b04SRichard Tran Mills ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 527938d9b04SRichard Tran Mills ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 528938d9b04SRichard Tran Mills 529938d9b04SRichard Tran Mills aj = a->j; /* aj[k] gives column index for element aa[k]. */ 530938d9b04SRichard Tran Mills aa = a->a; /* Nonzero elements stored row-by-row. */ 531938d9b04SRichard Tran Mills ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */ 532938d9b04SRichard Tran Mills 533938d9b04SRichard Tran Mills /* Get the info we need about the permutations and groupings. */ 534938d9b04SRichard Tran Mills aijperm = (Mat_SeqAIJPERM*) A->spptr; 535938d9b04SRichard Tran Mills iperm = aijperm->iperm; 536938d9b04SRichard Tran Mills ngroup = aijperm->ngroup; 537938d9b04SRichard Tran Mills xgroup = aijperm->xgroup; 538938d9b04SRichard Tran Mills nzgroup = aijperm->nzgroup; 539938d9b04SRichard Tran Mills 540938d9b04SRichard Tran Mills #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM) 541938d9b04SRichard Tran Mills fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w); 542938d9b04SRichard Tran Mills #else 543938d9b04SRichard Tran Mills 544938d9b04SRichard Tran Mills for (igroup=0; igroup<ngroup; igroup++) { 545938d9b04SRichard Tran Mills jstart = xgroup[igroup]; 546938d9b04SRichard Tran Mills jend = xgroup[igroup+1] - 1; 547938d9b04SRichard Tran Mills 548938d9b04SRichard Tran Mills nz = nzgroup[igroup]; 549938d9b04SRichard Tran Mills 550938d9b04SRichard Tran Mills /* Handle the special cases where the number of nonzeros per row 551938d9b04SRichard Tran Mills * in the group is either 0 or 1. */ 552938d9b04SRichard Tran Mills if (nz == 0) { 553938d9b04SRichard Tran Mills for (i=jstart; i<=jend; i++) { 554938d9b04SRichard Tran Mills iold = iperm[i]; 555938d9b04SRichard Tran Mills y[iold] = w[iold]; 556938d9b04SRichard Tran Mills } 557938d9b04SRichard Tran Mills } 558938d9b04SRichard Tran Mills else if (nz == 1) { 559938d9b04SRichard Tran Mills for (i=jstart; i<=jend; i++) { 560938d9b04SRichard Tran Mills iold = iperm[i]; 561938d9b04SRichard Tran Mills ipos = ai[iold]; 562938d9b04SRichard Tran Mills y[iold] = w[iold] + aa[ipos] * x[aj[ipos]]; 563938d9b04SRichard Tran Mills } 564938d9b04SRichard Tran Mills } 565938d9b04SRichard Tran Mills /* For the general case: */ 566938d9b04SRichard Tran Mills else { 567938d9b04SRichard Tran Mills 568938d9b04SRichard Tran Mills /* We work our way through the current group in chunks of NDIM rows 569938d9b04SRichard Tran Mills * at a time. */ 570938d9b04SRichard Tran Mills 571938d9b04SRichard Tran Mills for (istart=jstart; istart<=jend; istart+=NDIM) { 572938d9b04SRichard Tran Mills /* Figure out where the chunk of 'isize' rows ends in iperm. 573938d9b04SRichard Tran Mills * 'isize may of course be less than NDIM for the last chunk. */ 574938d9b04SRichard Tran Mills iend = istart + (NDIM - 1); 575938d9b04SRichard Tran Mills if (iend > jend) iend = jend; 576938d9b04SRichard Tran Mills isize = iend - istart + 1; 577938d9b04SRichard Tran Mills 578938d9b04SRichard Tran Mills /* Initialize the yp[] array that will be used to hold part of 579938d9b04SRichard Tran Mills * the permuted results vector, and figure out where in aa each 580938d9b04SRichard Tran Mills * row of the chunk will begin. */ 581938d9b04SRichard Tran Mills for (i=0; i<isize; i++) { 582938d9b04SRichard Tran Mills iold = iperm[istart + i]; 583938d9b04SRichard Tran Mills /* iold is a row number from the matrix A *before* reordering. */ 584938d9b04SRichard Tran Mills ip[i] = ai[iold]; 585938d9b04SRichard Tran Mills /* ip[i] tells us where the ith row of the chunk begins in aa. */ 586938d9b04SRichard Tran Mills yp[i] = w[iold]; 587938d9b04SRichard Tran Mills } 588938d9b04SRichard Tran Mills 589938d9b04SRichard Tran Mills /* If the number of zeros per row exceeds the number of rows in 590938d9b04SRichard Tran Mills * the chunk, we should vectorize along nz, that is, perform the 591938d9b04SRichard Tran Mills * mat-vec one row at a time as in the usual CSR case. */ 592938d9b04SRichard Tran Mills if (nz > isize) { 593938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR) 594938d9b04SRichard Tran Mills #pragma _CRI preferstream 595938d9b04SRichard Tran Mills #endif 596938d9b04SRichard Tran Mills for (i=0; i<isize; i++) { 597938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR) 598938d9b04SRichard Tran Mills #pragma _CRI prefervector 599938d9b04SRichard Tran Mills #endif 600938d9b04SRichard Tran Mills for (j=0; j<nz; j++) { 601938d9b04SRichard Tran Mills ipos = ip[i] + j; 602938d9b04SRichard Tran Mills yp[i] += aa[ipos] * x[aj[ipos]]; 603938d9b04SRichard Tran Mills } 604938d9b04SRichard Tran Mills } 605938d9b04SRichard Tran Mills } 606938d9b04SRichard Tran Mills /* Otherwise, there are enough rows in the chunk to make it 607938d9b04SRichard Tran Mills * worthwhile to vectorize across the rows, that is, to do the 608938d9b04SRichard Tran Mills * matvec by operating with "columns" of the chunk. */ 609938d9b04SRichard Tran Mills else { 610938d9b04SRichard Tran Mills for (j=0; j<nz; j++) { 611938d9b04SRichard Tran Mills for (i=0; i<isize; i++) { 612938d9b04SRichard Tran Mills ipos = ip[i] + j; 613938d9b04SRichard Tran Mills yp[i] += aa[ipos] * x[aj[ipos]]; 614938d9b04SRichard Tran Mills } 615938d9b04SRichard Tran Mills } 616938d9b04SRichard Tran Mills } 617938d9b04SRichard Tran Mills 618938d9b04SRichard Tran Mills #if defined(PETSC_HAVE_CRAY_VECTOR) 619938d9b04SRichard Tran Mills #pragma _CRI ivdep 620938d9b04SRichard Tran Mills #endif 621938d9b04SRichard Tran Mills /* Put results from yp[] into non-permuted result vector y. */ 622938d9b04SRichard Tran Mills for (i=0; i<isize; i++) { 623938d9b04SRichard Tran Mills y[iperm[istart+i]] = yp[i]; 624938d9b04SRichard Tran Mills } 625938d9b04SRichard Tran Mills } /* End processing chunk of isize rows of a group. */ 626938d9b04SRichard Tran Mills 627938d9b04SRichard Tran Mills } /* End handling matvec for chunk with nz > 1. */ 628938d9b04SRichard Tran Mills } /* End loop over igroup. */ 629938d9b04SRichard Tran Mills 630938d9b04SRichard Tran Mills #endif 631938d9b04SRichard Tran Mills ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 632938d9b04SRichard Tran Mills ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 633938d9b04SRichard Tran Mills ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr); 634938d9b04SRichard Tran Mills PetscFunctionReturn(0); 635938d9b04SRichard Tran Mills } 636938d9b04SRichard Tran Mills 637938d9b04SRichard Tran Mills 638938d9b04SRichard Tran Mills /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a 639938d9b04SRichard Tran Mills * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM() 640938d9b04SRichard Tran Mills * routine, but can also be used to convert an assembled SeqAIJ matrix 641938d9b04SRichard Tran Mills * into a SeqAIJPERM one. */ 642938d9b04SRichard Tran Mills PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat) 643938d9b04SRichard Tran Mills { 644938d9b04SRichard Tran Mills PetscErrorCode ierr; 645938d9b04SRichard Tran Mills Mat B = *newmat; 646938d9b04SRichard Tran Mills Mat_SeqAIJPERM *aijperm; 647938d9b04SRichard Tran Mills PetscBool sametype; 648938d9b04SRichard Tran Mills 649938d9b04SRichard Tran Mills PetscFunctionBegin; 650938d9b04SRichard Tran Mills if (reuse == MAT_INITIAL_MATRIX) { 651938d9b04SRichard Tran Mills ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr); 652938d9b04SRichard Tran Mills } 653938d9b04SRichard Tran Mills ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr); 654938d9b04SRichard Tran Mills if (sametype) PetscFunctionReturn(0); 655938d9b04SRichard Tran Mills 656938d9b04SRichard Tran Mills ierr = PetscNewLog(B,&aijperm);CHKERRQ(ierr); 657938d9b04SRichard Tran Mills B->spptr = (void*) aijperm; 658938d9b04SRichard Tran Mills 659938d9b04SRichard Tran Mills /* Set function pointers for methods that we inherit from AIJ but override. */ 660938d9b04SRichard Tran Mills B->ops->duplicate = MatDuplicate_SeqAIJPERM; 661938d9b04SRichard Tran Mills B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM; 662938d9b04SRichard Tran Mills B->ops->destroy = MatDestroy_SeqAIJPERM; 663938d9b04SRichard Tran Mills B->ops->mult = MatMult_SeqAIJPERM; 664938d9b04SRichard Tran Mills B->ops->multadd = MatMultAdd_SeqAIJPERM; 665938d9b04SRichard Tran Mills 666938d9b04SRichard Tran Mills aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/ 667938d9b04SRichard Tran Mills /* If A has already been assembled, compute the permutation. */ 668938d9b04SRichard Tran Mills if (A->assembled) { 669938d9b04SRichard Tran Mills ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr); 670938d9b04SRichard Tran Mills } 671938d9b04SRichard Tran Mills 672938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr); 673938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMult_seqdense_seqaijperm_C",MatMatMult_SeqDense_SeqAIJ);CHKERRQ(ierr); 674938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultSymbolic_seqdense_seqaijperm_C",MatMatMultSymbolic_SeqDense_SeqAIJ);CHKERRQ(ierr); 675938d9b04SRichard Tran Mills ierr = PetscObjectComposeFunction((PetscObject)B,"MatMatMultNumeric_seqdense_seqaijperm_C",MatMatMultNumeric_SeqDense_SeqAIJ);CHKERRQ(ierr); 676938d9b04SRichard Tran Mills 677938d9b04SRichard Tran Mills ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr); 678938d9b04SRichard Tran Mills *newmat = B; 679938d9b04SRichard Tran Mills PetscFunctionReturn(0); 680938d9b04SRichard Tran Mills } 681938d9b04SRichard Tran Mills 682938d9b04SRichard Tran Mills /*@C 683938d9b04SRichard Tran Mills MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM. 684938d9b04SRichard Tran Mills This type inherits from AIJ, but calculates some additional permutation 685938d9b04SRichard Tran Mills information that is used to allow better vectorization of some 686938d9b04SRichard Tran Mills operations. At the cost of increased storage, the AIJ formatted 687938d9b04SRichard Tran Mills matrix can be copied to a format in which pieces of the matrix are 688938d9b04SRichard Tran Mills stored in ELLPACK format, allowing the vectorized matrix multiply 689938d9b04SRichard Tran Mills routine to use stride-1 memory accesses. As with the AIJ type, it is 690938d9b04SRichard Tran Mills important to preallocate matrix storage in order to get good assembly 691938d9b04SRichard Tran Mills performance. 692938d9b04SRichard Tran Mills 693938d9b04SRichard Tran Mills Collective on MPI_Comm 694938d9b04SRichard Tran Mills 695938d9b04SRichard Tran Mills Input Parameters: 696938d9b04SRichard Tran Mills + comm - MPI communicator, set to PETSC_COMM_SELF 697938d9b04SRichard Tran Mills . m - number of rows 698938d9b04SRichard Tran Mills . n - number of columns 699938d9b04SRichard Tran Mills . nz - number of nonzeros per row (same for all rows) 700938d9b04SRichard Tran Mills - nnz - array containing the number of nonzeros in the various rows 701938d9b04SRichard Tran Mills (possibly different for each row) or NULL 702938d9b04SRichard Tran Mills 703938d9b04SRichard Tran Mills Output Parameter: 704938d9b04SRichard Tran Mills . A - the matrix 705938d9b04SRichard Tran Mills 706938d9b04SRichard Tran Mills Notes: 707938d9b04SRichard Tran Mills If nnz is given then nz is ignored 708938d9b04SRichard Tran Mills 709938d9b04SRichard Tran Mills Level: intermediate 710938d9b04SRichard Tran Mills 711938d9b04SRichard Tran Mills .keywords: matrix, cray, sparse, parallel 712938d9b04SRichard Tran Mills 713938d9b04SRichard Tran Mills .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues() 714938d9b04SRichard Tran Mills @*/ 715938d9b04SRichard Tran Mills PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 716938d9b04SRichard Tran Mills { 717938d9b04SRichard Tran Mills PetscErrorCode ierr; 718938d9b04SRichard Tran Mills 719938d9b04SRichard Tran Mills PetscFunctionBegin; 720938d9b04SRichard Tran Mills ierr = MatCreate(comm,A);CHKERRQ(ierr); 721938d9b04SRichard Tran Mills ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 722938d9b04SRichard Tran Mills ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr); 723938d9b04SRichard Tran Mills ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr); 724938d9b04SRichard Tran Mills PetscFunctionReturn(0); 725938d9b04SRichard Tran Mills } 726938d9b04SRichard Tran Mills 727938d9b04SRichard Tran Mills PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A) 728938d9b04SRichard Tran Mills { 729938d9b04SRichard Tran Mills PetscErrorCode ierr; 730938d9b04SRichard Tran Mills 731938d9b04SRichard Tran Mills PetscFunctionBegin; 732938d9b04SRichard Tran Mills ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr); 733938d9b04SRichard Tran Mills ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 734938d9b04SRichard Tran Mills PetscFunctionReturn(0); 735938d9b04SRichard Tran Mills } 736938d9b04SRichard Tran Mills 737