1d4002b98SHong Zhang 26524c165SJacob Faibussowitsch #ifndef __SELL_H 3d4002b98SHong Zhang #define __SELL_H 4d4002b98SHong Zhang 5d4002b98SHong Zhang #include <petsc/private/matimpl.h> 6eec179cfSJacob Faibussowitsch #include <petsc/private/hashmapi.h> 7d4002b98SHong Zhang 8d4002b98SHong Zhang /* 94e58db63SHong Zhang For NVIDIA GPUs each slice should be padded to the boundary of 16 elements for best performance. 104e58db63SHong Zhang The optimal memory alignment in device memory is 128 bytes, 64 bytes, 32 bytes for double precision, single precision and half precision. 114e58db63SHong Zhang */ 124e58db63SHong Zhang #if defined(PETSC_HAVE_DEVICE) 134e58db63SHong Zhang #define DEVICE_MEM_ALIGN 16 144e58db63SHong Zhang #endif 154e58db63SHong Zhang 164e58db63SHong Zhang /* 17d4002b98SHong Zhang Struct header for SeqSELL matrix format 18d4002b98SHong Zhang */ 19d4002b98SHong Zhang #define SEQSELLHEADER(datatype) \ 20d4002b98SHong Zhang PetscBool roworiented; /* if true, row-oriented input, default */ \ 21d4002b98SHong Zhang PetscInt nonew; /* 1 don't add new nonzeros, -1 generate error on new */ \ 22d4002b98SHong Zhang PetscInt nounused; /* -1 generate error on unused space */ \ 23d4002b98SHong Zhang PetscBool singlemalloc; /* if true a, i, and j have been obtained with one big malloc */ \ 24d4002b98SHong Zhang PetscInt maxallocmat; /* max allocated space for the matrix */ \ 25d4002b98SHong Zhang PetscInt maxallocrow; /* max allocated space for each row */ \ 26d4002b98SHong Zhang PetscInt nz; /* actual nonzeros */ \ 27d4002b98SHong Zhang PetscInt rlenmax; /* max actual row length, rmax cannot exceed maxallocrow */ \ 28d4002b98SHong Zhang PetscInt *rlen; /* actual length of each row (padding zeros excluded) */ \ 29d4002b98SHong Zhang PetscBool free_rlen; /* free rlen array ? */ \ 30d4002b98SHong Zhang PetscInt reallocs; /* number of mallocs done during MatSetValues() \ 31d4002b98SHong Zhang as more values are set than were prealloced */ \ 32d4002b98SHong Zhang PetscBool keepnonzeropattern; /* keeps matrix structure same in calls to MatZeroRows()*/ \ 33d4002b98SHong Zhang PetscBool ignorezeroentries; \ 34d4002b98SHong Zhang PetscBool free_colidx; /* free the column indices colidx when the matrix is destroyed */ \ 35d4002b98SHong Zhang PetscBool free_val; /* free the numerical values when matrix is destroy */ \ 36d4002b98SHong Zhang PetscInt *colidx; /* column index */ \ 37d4002b98SHong Zhang PetscInt *diag; /* pointers to diagonal elements */ \ 38d4002b98SHong Zhang PetscInt nonzerorowcnt; /* how many rows have nonzero entries */ \ 39d4002b98SHong Zhang PetscBool free_diag; /* free diag ? */ \ 40d4002b98SHong Zhang datatype *val; /* elements including nonzeros and padding zeros */ \ 41d4002b98SHong Zhang PetscScalar *solve_work; /* work space used in MatSolve */ \ 42d4002b98SHong Zhang IS row, col, icol; /* index sets, used for reorderings */ \ 43d4002b98SHong Zhang PetscBool pivotinblocks; /* pivot inside factorization of each diagonal block */ \ 44d4002b98SHong Zhang Mat parent; /* set if this matrix was formed with MatDuplicate(...,MAT_SHARE_NONZERO_PATTERN,....); 45d4002b98SHong Zhang means that this shares some data structures with the parent including diag, ilen, imax, i, j */ \ 46d4002b98SHong Zhang PetscInt *sliidx; /* slice index */ \ 476108893eSStefano Zampini PetscInt totalslices; /* total number of slices */ \ 4807e43b41SHong Zhang PetscInt sliceheight; /* slice height */ \ 4907e43b41SHong Zhang PetscReal fillratio; /* ratio of number of padded zeros over total number of elements */ \ 5007e43b41SHong Zhang PetscReal avgslicewidth; /* average slice width */ \ 5107e43b41SHong Zhang PetscInt maxslicewidth; /* maximum slice width */ \ 52*b921024eSHong Zhang PetscReal varslicesize; /* variance of slice size */ \ 5307e43b41SHong Zhang PetscInt *sliperm; /* slice permutation array, CUDA only */ \ 5407e43b41SHong Zhang PetscInt totalblocks; /* total number of blocks, CUDA only */ \ 5507e43b41SHong Zhang PetscInt *blockidx; /* block index, CUDA only */ \ 5607e43b41SHong Zhang PetscInt *block_row_map; /* starting row of the current block, CUDA only */ \ 5790d2215bSHong Zhang PetscInt chunksize; /* chunk size, CUDA only */ \ 5890d2215bSHong Zhang PetscInt totalchunks; /* total number of chunks, CUDA only */ \ 5990d2215bSHong Zhang PetscInt *chunk_slice_map; /* starting slice of the currect chunk, CUDA only */ \ 606108893eSStefano Zampini PetscInt *getrowcols; /* workarray for MatGetRow_SeqSELL */ \ 619371c9d4SSatish Balay PetscScalar *getrowvals /* workarray for MatGetRow_SeqSELL */ 62d4002b98SHong Zhang 63d4002b98SHong Zhang typedef struct { 64d4002b98SHong Zhang SEQSELLHEADER(MatScalar); 65d4002b98SHong Zhang MatScalar *saved_values; /* location for stashing nonzero values of matrix */ 66d4002b98SHong Zhang PetscScalar *idiag, *mdiag, *ssor_work; /* inverse of diagonal entries, diagonal values and workspace for Eisenstat trick */ 67d4002b98SHong Zhang PetscBool idiagvalid; /* current idiag[] and mdiag[] are valid */ 68d4002b98SHong Zhang PetscScalar fshift, omega; /* last used omega and fshift */ 69d4002b98SHong Zhang ISColoring coloring; /* set with MatADSetColoring() used by MatADSetValues() */ 70d4002b98SHong Zhang } Mat_SeqSELL; 71d4002b98SHong Zhang 72d4002b98SHong Zhang /* 73d4002b98SHong Zhang Frees the arrays from the XSELLPACK matrix type 74d4002b98SHong Zhang */ 75d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatSeqXSELLFreeSELL(Mat AA, MatScalar **val, PetscInt **colidx) 76d71ae5a4SJacob Faibussowitsch { 77d4002b98SHong Zhang Mat_SeqSELL *A = (Mat_SeqSELL *)AA->data; 78d4002b98SHong Zhang if (A->singlemalloc) { 799566063dSJacob Faibussowitsch PetscCall(PetscFree2(*val, *colidx)); 80d4002b98SHong Zhang } else { 819566063dSJacob Faibussowitsch if (A->free_val) PetscCall(PetscFree(*val)); 829566063dSJacob Faibussowitsch if (A->free_colidx) PetscCall(PetscFree(*colidx)); 83d4002b98SHong Zhang } 843ba16761SJacob Faibussowitsch return PETSC_SUCCESS; 85d4002b98SHong Zhang } 86d4002b98SHong Zhang 874e58db63SHong Zhang #define MatSeqXSELLReallocateSELL(Amat, AM, BS2, WIDTH, SIDX, SH, SID, ROW, COL, COLIDX, VAL, CP, VP, NONEW, datatype, MUL) \ 8807e43b41SHong Zhang if (WIDTH >= (SIDX[SID + 1] - SIDX[SID]) / SH) { \ 89d4002b98SHong Zhang Mat_SeqSELL *Ain = (Mat_SeqSELL *)Amat->data; \ 902d1451d4SHong Zhang /* there is no extra room in row, therefore enlarge 1 slice column */ \ 914e58db63SHong Zhang PetscInt new_size = Ain->maxallocmat + SH * MUL, *new_colidx; \ 92d4002b98SHong Zhang datatype *new_val; \ 93d4002b98SHong Zhang \ 9408401ef6SPierre Jolivet PetscCheck(NONEW != -2, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "New nonzero at (%" PetscInt_FMT ",%" PetscInt_FMT ") caused a malloc\nUse MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE) to turn off this check", ROW, COL); \ 95d4002b98SHong Zhang /* malloc new storage space */ \ 969566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(BS2 *new_size, &new_val, BS2 *new_size, &new_colidx)); \ 97d4002b98SHong Zhang \ 98d4002b98SHong Zhang /* copy over old data into new slots by two steps: one step for data before the current slice and the other for the rest */ \ 999566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(new_val, VAL, SIDX[SID + 1])); \ 1009566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(new_colidx, COLIDX, SIDX[SID + 1])); \ 1014e58db63SHong Zhang PetscCall(PetscArraycpy(new_val + SIDX[SID + 1] + SH * MUL, VAL + SIDX[SID + 1], SIDX[Ain->totalslices] - SIDX[SID + 1])); \ 1024e58db63SHong Zhang PetscCall(PetscArraycpy(new_colidx + SIDX[SID + 1] + SH * MUL, COLIDX + SIDX[SID + 1], SIDX[Ain->totalslices] - SIDX[SID + 1])); \ 103d4002b98SHong Zhang /* update slice_idx */ \ 1044e58db63SHong Zhang for (ii = SID + 1; ii <= Ain->totalslices; ii++) { SIDX[ii] += SH * MUL; } \ 1052d1451d4SHong Zhang /* update pointers. Notice that they point to the FIRST postion of the row */ \ 10607e43b41SHong Zhang CP = new_colidx + SIDX[SID] + (ROW % SH); \ 10707e43b41SHong Zhang VP = new_val + SIDX[SID] + (ROW % SH); \ 108d4002b98SHong Zhang /* free up old matrix storage */ \ 1099566063dSJacob Faibussowitsch PetscCall(MatSeqXSELLFreeSELL(A, &Ain->val, &Ain->colidx)); \ 110d4002b98SHong Zhang Ain->val = (MatScalar *)new_val; \ 111d4002b98SHong Zhang Ain->colidx = new_colidx; \ 112d4002b98SHong Zhang Ain->singlemalloc = PETSC_TRUE; \ 113d4002b98SHong Zhang Ain->maxallocmat = new_size; \ 114d4002b98SHong Zhang Ain->reallocs++; \ 1154e58db63SHong Zhang if (WIDTH >= Ain->maxallocrow) Ain->maxallocrow += MUL; \ 116d4002b98SHong Zhang if (WIDTH >= Ain->rlenmax) Ain->rlenmax++; \ 1179371c9d4SSatish Balay } 118d4002b98SHong Zhang 119d4002b98SHong Zhang #define MatSetValue_SeqSELL_Private(A, row, col, value, addv, orow, ocol, cp, vp, lastcol, low, high) \ 120d4002b98SHong Zhang { \ 121d4002b98SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; \ 122d4002b98SHong Zhang found = PETSC_FALSE; \ 123d4002b98SHong Zhang if (col <= lastcol) low = 0; \ 124d4002b98SHong Zhang else high = a->rlen[row]; \ 125d4002b98SHong Zhang lastcol = col; \ 126d4002b98SHong Zhang while (high - low > 5) { \ 127d4002b98SHong Zhang t = (low + high) / 2; \ 12807e43b41SHong Zhang if (*(cp + a->sliceheight * t) > col) high = t; \ 129d4002b98SHong Zhang else low = t; \ 130d4002b98SHong Zhang } \ 131d4002b98SHong Zhang for (_i = low; _i < high; _i++) { \ 13207e43b41SHong Zhang if (*(cp + a->sliceheight * _i) > col) break; \ 13307e43b41SHong Zhang if (*(cp + a->sliceheight * _i) == col) { \ 13407e43b41SHong Zhang if (addv == ADD_VALUES) *(vp + a->sliceheight * _i) += value; \ 13507e43b41SHong Zhang else *(vp + a->sliceheight * _i) = value; \ 136d4002b98SHong Zhang found = PETSC_TRUE; \ 137d4002b98SHong Zhang break; \ 138d4002b98SHong Zhang } \ 139d4002b98SHong Zhang } \ 140d4002b98SHong Zhang if (!found) { \ 14108401ef6SPierre Jolivet PetscCheck(a->nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \ 14207e43b41SHong Zhang if (a->nonew != 1 && !(value == 0.0 && a->ignorezeroentries) && a->rlen[row] >= (a->sliidx[row / a->sliceheight + 1] - a->sliidx[row / a->sliceheight]) / a->sliceheight) { \ 1432d1451d4SHong Zhang /* there is no extra room in row, therefore enlarge 1 slice column */ \ 14407e43b41SHong Zhang if (a->maxallocmat < a->sliidx[a->totalslices] + a->sliceheight) { \ 145d4002b98SHong Zhang /* allocates a larger array for the XSELL matrix types; only extend the current slice by one more column. */ \ 14607e43b41SHong Zhang PetscInt new_size = a->maxallocmat + a->sliceheight, *new_colidx; \ 147d4002b98SHong Zhang MatScalar *new_val; \ 14808401ef6SPierre Jolivet PetscCheck(a->nonew != -2, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "New nonzero at (%" PetscInt_FMT ",%" PetscInt_FMT ") caused a malloc\nUse MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE) to turn off this check", orow, ocol); \ 149d4002b98SHong Zhang /* malloc new storage space */ \ 1509566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(new_size, &new_val, new_size, &new_colidx)); \ 151d4002b98SHong Zhang /* copy over old data into new slots by two steps: one step for data before the current slice and the other for the rest */ \ 15207e43b41SHong Zhang PetscCall(PetscArraycpy(new_val, a->val, a->sliidx[row / a->sliceheight + 1])); \ 15307e43b41SHong Zhang PetscCall(PetscArraycpy(new_colidx, a->colidx, a->sliidx[row / a->sliceheight + 1])); \ 15407e43b41SHong Zhang PetscCall(PetscArraycpy(new_val + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->val + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 15507e43b41SHong Zhang PetscCall(PetscArraycpy(new_colidx + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->colidx + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 1562d1451d4SHong Zhang /* update pointers. Notice that they point to the FIRST postion of the row */ \ 15707e43b41SHong Zhang cp = new_colidx + a->sliidx[row / a->sliceheight] + (row % a->sliceheight); \ 15807e43b41SHong Zhang vp = new_val + a->sliidx[row / a->sliceheight] + (row % a->sliceheight); \ 159d4002b98SHong Zhang /* free up old matrix storage */ \ 1609566063dSJacob Faibussowitsch PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); \ 161d4002b98SHong Zhang a->val = (MatScalar *)new_val; \ 162d4002b98SHong Zhang a->colidx = new_colidx; \ 163d4002b98SHong Zhang a->singlemalloc = PETSC_TRUE; \ 164d4002b98SHong Zhang a->maxallocmat = new_size; \ 165d4002b98SHong Zhang a->reallocs++; \ 166d4002b98SHong Zhang } else { \ 167d4002b98SHong Zhang /* no need to reallocate, just shift the following slices to create space for the added slice column */ \ 16807e43b41SHong Zhang PetscCall(PetscArraymove(a->val + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->val + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 16907e43b41SHong Zhang PetscCall(PetscArraymove(a->colidx + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->colidx + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 170d4002b98SHong Zhang } \ 171d4002b98SHong Zhang /* update slice_idx */ \ 17207e43b41SHong Zhang for (ii = row / a->sliceheight + 1; ii <= a->totalslices; ii++) a->sliidx[ii] += a->sliceheight; \ 173d4002b98SHong Zhang if (a->rlen[row] >= a->maxallocrow) a->maxallocrow++; \ 174d4002b98SHong Zhang if (a->rlen[row] >= a->rlenmax) a->rlenmax++; \ 175d4002b98SHong Zhang } \ 176d4002b98SHong Zhang /* shift up all the later entries in this row */ \ 177d4002b98SHong Zhang for (ii = a->rlen[row] - 1; ii >= _i; ii--) { \ 17807e43b41SHong Zhang *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); \ 17907e43b41SHong Zhang *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); \ 180d4002b98SHong Zhang } \ 18107e43b41SHong Zhang *(cp + a->sliceheight * _i) = col; \ 18207e43b41SHong Zhang *(vp + a->sliceheight * _i) = value; \ 1839371c9d4SSatish Balay a->nz++; \ 1849371c9d4SSatish Balay a->rlen[row]++; \ 1859371c9d4SSatish Balay A->nonzerostate++; \ 1869371c9d4SSatish Balay low = _i + 1; \ 1879371c9d4SSatish Balay high++; \ 188d4002b98SHong Zhang } \ 1899371c9d4SSatish Balay } 190d4002b98SHong Zhang 191d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat, PetscInt, const PetscInt[]); 192d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMult_SeqSELL(Mat, Vec, Vec); 193d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultAdd_SeqSELL(Mat, Vec, Vec, Vec); 194d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultTranspose_SeqSELL(Mat, Vec, Vec); 195d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat, Vec, Vec, Vec); 196d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMissingDiagonal_SeqSELL(Mat, PetscBool *, PetscInt *); 197d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMarkDiagonal_SeqSELL(Mat); 198d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatInvertDiagonal_SeqSELL(Mat, PetscScalar, PetscScalar); 199d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatZeroEntries_SeqSELL(Mat); 200d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDestroy_SeqSELL(Mat); 201d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetOption_SeqSELL(Mat, MatOption, PetscBool); 202d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqSELL(Mat, Vec v); 203d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetValues_SeqSELL(Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[], PetscScalar[]); 204d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatView_SeqSELL(Mat, PetscViewer); 205d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SeqSELL(Mat, MatAssemblyType); 206d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetInfo_SeqSELL(Mat, MatInfoType, MatInfo *); 207d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetValues_SeqSELL(Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[], const PetscScalar[], InsertMode); 208d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatCopy_SeqSELL(Mat, Mat, MatStructure); 209d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetUp_SeqSELL(Mat); 210d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat, PetscScalar *[]); 211d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat, PetscScalar *[]); 212d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatShift_SeqSELL(Mat, PetscScalar); 213d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSOR_SeqSELL(Mat, Vec, PetscReal, MatSORType, PetscReal, PetscInt, PetscInt, Vec); 214d4002b98SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat); 215d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDuplicate_SeqSELL(Mat, MatDuplicateOption, Mat *); 216d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatEqual_SeqSELL(Mat, Mat, PetscBool *); 217d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat); 218d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat, MatType, MatReuse, Mat *); 219d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat, MatType, MatReuse, Mat *); 220d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatFDColoringCreate_SeqSELL(Mat, ISColoring, MatFDColoring); 221d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatFDColoringSetUp_SeqSELL(Mat, ISColoring, MatFDColoring); 222d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetColumnIJ_SeqSELL_Color(Mat, PetscInt, PetscBool, PetscBool, PetscInt *, const PetscInt *[], const PetscInt *[], PetscInt *[], PetscBool *); 223d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatRestoreColumnIJ_SeqSELL_Color(Mat, PetscInt, PetscBool, PetscBool, PetscInt *, const PetscInt *[], const PetscInt *[], PetscInt *[], PetscBool *); 224d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConjugate_SeqSELL(Mat A); 225d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatScale_SeqSELL(Mat, PetscScalar); 226d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDiagonalScale_SeqSELL(Mat, Vec, Vec); 227d4002b98SHong Zhang #endif 228