1d4002b98SHong Zhang 26524c165SJacob Faibussowitsch #ifndef __SELL_H 3d4002b98SHong Zhang #define __SELL_H 4d4002b98SHong Zhang 5d4002b98SHong Zhang #include <petsc/private/matimpl.h> 6eec179cfSJacob Faibussowitsch #include <petsc/private/hashmapi.h> 7d4002b98SHong Zhang 8d4002b98SHong Zhang /* 94e58db63SHong Zhang For NVIDIA GPUs each slice should be padded to the boundary of 16 elements for best performance. 104e58db63SHong Zhang The optimal memory alignment in device memory is 128 bytes, 64 bytes, 32 bytes for double precision, single precision and half precision. 114e58db63SHong Zhang */ 124e58db63SHong Zhang #if defined(PETSC_HAVE_DEVICE) 134e58db63SHong Zhang #define DEVICE_MEM_ALIGN 16 144e58db63SHong Zhang #endif 154e58db63SHong Zhang 164e58db63SHong Zhang /* 17d4002b98SHong Zhang Struct header for SeqSELL matrix format 18d4002b98SHong Zhang */ 19d4002b98SHong Zhang #define SEQSELLHEADER(datatype) \ 20d4002b98SHong Zhang PetscBool roworiented; /* if true, row-oriented input, default */ \ 21d4002b98SHong Zhang PetscInt nonew; /* 1 don't add new nonzeros, -1 generate error on new */ \ 22d4002b98SHong Zhang PetscInt nounused; /* -1 generate error on unused space */ \ 23d4002b98SHong Zhang PetscBool singlemalloc; /* if true a, i, and j have been obtained with one big malloc */ \ 24d4002b98SHong Zhang PetscInt maxallocmat; /* max allocated space for the matrix */ \ 25d4002b98SHong Zhang PetscInt maxallocrow; /* max allocated space for each row */ \ 26d4002b98SHong Zhang PetscInt nz; /* actual nonzeros */ \ 27d4002b98SHong Zhang PetscInt rlenmax; /* max actual row length, rmax cannot exceed maxallocrow */ \ 28d4002b98SHong Zhang PetscInt *rlen; /* actual length of each row (padding zeros excluded) */ \ 29d4002b98SHong Zhang PetscBool free_rlen; /* free rlen array ? */ \ 30d4002b98SHong Zhang PetscInt reallocs; /* number of mallocs done during MatSetValues() \ 31d4002b98SHong Zhang as more values are set than were prealloced */ \ 32d4002b98SHong Zhang PetscBool keepnonzeropattern; /* keeps matrix structure same in calls to MatZeroRows()*/ \ 33d4002b98SHong Zhang PetscBool ignorezeroentries; \ 34d4002b98SHong Zhang PetscBool free_colidx; /* free the column indices colidx when the matrix is destroyed */ \ 35d4002b98SHong Zhang PetscBool free_val; /* free the numerical values when matrix is destroy */ \ 36d4002b98SHong Zhang PetscInt *colidx; /* column index */ \ 37d4002b98SHong Zhang PetscInt *diag; /* pointers to diagonal elements */ \ 38d4002b98SHong Zhang PetscInt nonzerorowcnt; /* how many rows have nonzero entries */ \ 39d4002b98SHong Zhang PetscBool free_diag; /* free diag ? */ \ 40d4002b98SHong Zhang datatype *val; /* elements including nonzeros and padding zeros */ \ 41d4002b98SHong Zhang PetscScalar *solve_work; /* work space used in MatSolve */ \ 42d4002b98SHong Zhang IS row, col, icol; /* index sets, used for reorderings */ \ 43d4002b98SHong Zhang PetscBool pivotinblocks; /* pivot inside factorization of each diagonal block */ \ 44d4002b98SHong Zhang Mat parent; /* set if this matrix was formed with MatDuplicate(...,MAT_SHARE_NONZERO_PATTERN,....); 45d4002b98SHong Zhang means that this shares some data structures with the parent including diag, ilen, imax, i, j */ \ 46d4002b98SHong Zhang PetscInt *sliidx; /* slice index */ \ 476108893eSStefano Zampini PetscInt totalslices; /* total number of slices */ \ 4807e43b41SHong Zhang PetscInt sliceheight; /* slice height */ \ 4907e43b41SHong Zhang PetscReal fillratio; /* ratio of number of padded zeros over total number of elements */ \ 5007e43b41SHong Zhang PetscReal avgslicewidth; /* average slice width */ \ 5107e43b41SHong Zhang PetscInt maxslicewidth; /* maximum slice width */ \ 52b921024eSHong Zhang PetscReal varslicesize; /* variance of slice size */ \ 5307e43b41SHong Zhang PetscInt *sliperm; /* slice permutation array, CUDA only */ \ 5407e43b41SHong Zhang PetscInt totalblocks; /* total number of blocks, CUDA only */ \ 5507e43b41SHong Zhang PetscInt *blockidx; /* block index, CUDA only */ \ 5607e43b41SHong Zhang PetscInt *block_row_map; /* starting row of the current block, CUDA only */ \ 5790d2215bSHong Zhang PetscInt chunksize; /* chunk size, CUDA only */ \ 5890d2215bSHong Zhang PetscInt totalchunks; /* total number of chunks, CUDA only */ \ 5990d2215bSHong Zhang PetscInt *chunk_slice_map; /* starting slice of the currect chunk, CUDA only */ \ 606108893eSStefano Zampini PetscInt *getrowcols; /* workarray for MatGetRow_SeqSELL */ \ 619371c9d4SSatish Balay PetscScalar *getrowvals /* workarray for MatGetRow_SeqSELL */ 62d4002b98SHong Zhang 63d4002b98SHong Zhang typedef struct { 64d4002b98SHong Zhang SEQSELLHEADER(MatScalar); 65d4002b98SHong Zhang MatScalar *saved_values; /* location for stashing nonzero values of matrix */ 66d4002b98SHong Zhang PetscScalar *idiag, *mdiag, *ssor_work; /* inverse of diagonal entries, diagonal values and workspace for Eisenstat trick */ 67d4002b98SHong Zhang PetscBool idiagvalid; /* current idiag[] and mdiag[] are valid */ 68d4002b98SHong Zhang PetscScalar fshift, omega; /* last used omega and fshift */ 69d4002b98SHong Zhang ISColoring coloring; /* set with MatADSetColoring() used by MatADSetValues() */ 70d4002b98SHong Zhang } Mat_SeqSELL; 71d4002b98SHong Zhang 72d4002b98SHong Zhang /* 73d4002b98SHong Zhang Frees the arrays from the XSELLPACK matrix type 74d4002b98SHong Zhang */ 75d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatSeqXSELLFreeSELL(Mat AA, MatScalar **val, PetscInt **colidx) 76d71ae5a4SJacob Faibussowitsch { 77d4002b98SHong Zhang Mat_SeqSELL *A = (Mat_SeqSELL *)AA->data; 78d4002b98SHong Zhang if (A->singlemalloc) { 799566063dSJacob Faibussowitsch PetscCall(PetscFree2(*val, *colidx)); 80d4002b98SHong Zhang } else { 819566063dSJacob Faibussowitsch if (A->free_val) PetscCall(PetscFree(*val)); 829566063dSJacob Faibussowitsch if (A->free_colidx) PetscCall(PetscFree(*colidx)); 83d4002b98SHong Zhang } 843ba16761SJacob Faibussowitsch return PETSC_SUCCESS; 85d4002b98SHong Zhang } 86d4002b98SHong Zhang 874e58db63SHong Zhang #define MatSeqXSELLReallocateSELL(Amat, AM, BS2, WIDTH, SIDX, SH, SID, ROW, COL, COLIDX, VAL, CP, VP, NONEW, datatype, MUL) \ 88*a8f51744SPierre Jolivet do { \ 8907e43b41SHong Zhang if (WIDTH >= (SIDX[SID + 1] - SIDX[SID]) / SH) { \ 90d4002b98SHong Zhang Mat_SeqSELL *Ain = (Mat_SeqSELL *)Amat->data; \ 912d1451d4SHong Zhang /* there is no extra room in row, therefore enlarge 1 slice column */ \ 924e58db63SHong Zhang PetscInt new_size = Ain->maxallocmat + SH * MUL, *new_colidx; \ 93d4002b98SHong Zhang datatype *new_val; \ 94d4002b98SHong Zhang \ 9508401ef6SPierre Jolivet PetscCheck(NONEW != -2, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "New nonzero at (%" PetscInt_FMT ",%" PetscInt_FMT ") caused a malloc\nUse MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE) to turn off this check", ROW, COL); \ 96d4002b98SHong Zhang /* malloc new storage space */ \ 979566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(BS2 *new_size, &new_val, BS2 *new_size, &new_colidx)); \ 98d4002b98SHong Zhang \ 99d4002b98SHong Zhang /* copy over old data into new slots by two steps: one step for data before the current slice and the other for the rest */ \ 1009566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(new_val, VAL, SIDX[SID + 1])); \ 1019566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(new_colidx, COLIDX, SIDX[SID + 1])); \ 1024e58db63SHong Zhang PetscCall(PetscArraycpy(new_val + SIDX[SID + 1] + SH * MUL, VAL + SIDX[SID + 1], SIDX[Ain->totalslices] - SIDX[SID + 1])); \ 1034e58db63SHong Zhang PetscCall(PetscArraycpy(new_colidx + SIDX[SID + 1] + SH * MUL, COLIDX + SIDX[SID + 1], SIDX[Ain->totalslices] - SIDX[SID + 1])); \ 104d4002b98SHong Zhang /* update slice_idx */ \ 1054e58db63SHong Zhang for (ii = SID + 1; ii <= Ain->totalslices; ii++) { SIDX[ii] += SH * MUL; } \ 1062d1451d4SHong Zhang /* update pointers. Notice that they point to the FIRST postion of the row */ \ 10707e43b41SHong Zhang CP = new_colidx + SIDX[SID] + (ROW % SH); \ 10807e43b41SHong Zhang VP = new_val + SIDX[SID] + (ROW % SH); \ 109d4002b98SHong Zhang /* free up old matrix storage */ \ 1109566063dSJacob Faibussowitsch PetscCall(MatSeqXSELLFreeSELL(A, &Ain->val, &Ain->colidx)); \ 111d4002b98SHong Zhang Ain->val = (MatScalar *)new_val; \ 112d4002b98SHong Zhang Ain->colidx = new_colidx; \ 113d4002b98SHong Zhang Ain->singlemalloc = PETSC_TRUE; \ 114d4002b98SHong Zhang Ain->maxallocmat = new_size; \ 115d4002b98SHong Zhang Ain->reallocs++; \ 1164e58db63SHong Zhang if (WIDTH >= Ain->maxallocrow) Ain->maxallocrow += MUL; \ 117d4002b98SHong Zhang if (WIDTH >= Ain->rlenmax) Ain->rlenmax++; \ 118*a8f51744SPierre Jolivet } \ 119*a8f51744SPierre Jolivet } while (0) 120d4002b98SHong Zhang 121d4002b98SHong Zhang #define MatSetValue_SeqSELL_Private(A, row, col, value, addv, orow, ocol, cp, vp, lastcol, low, high) \ 122*a8f51744SPierre Jolivet do { \ 123d4002b98SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; \ 124d4002b98SHong Zhang found = PETSC_FALSE; \ 125d4002b98SHong Zhang if (col <= lastcol) low = 0; \ 126d4002b98SHong Zhang else high = a->rlen[row]; \ 127d4002b98SHong Zhang lastcol = col; \ 128d4002b98SHong Zhang while (high - low > 5) { \ 129d4002b98SHong Zhang t = (low + high) / 2; \ 13007e43b41SHong Zhang if (*(cp + a->sliceheight * t) > col) high = t; \ 131d4002b98SHong Zhang else low = t; \ 132d4002b98SHong Zhang } \ 133d4002b98SHong Zhang for (_i = low; _i < high; _i++) { \ 13407e43b41SHong Zhang if (*(cp + a->sliceheight * _i) > col) break; \ 13507e43b41SHong Zhang if (*(cp + a->sliceheight * _i) == col) { \ 13607e43b41SHong Zhang if (addv == ADD_VALUES) *(vp + a->sliceheight * _i) += value; \ 13707e43b41SHong Zhang else *(vp + a->sliceheight * _i) = value; \ 138d4002b98SHong Zhang found = PETSC_TRUE; \ 139d4002b98SHong Zhang break; \ 140d4002b98SHong Zhang } \ 141d4002b98SHong Zhang } \ 142d4002b98SHong Zhang if (!found) { \ 14308401ef6SPierre Jolivet PetscCheck(a->nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \ 14407e43b41SHong Zhang if (a->nonew != 1 && !(value == 0.0 && a->ignorezeroentries) && a->rlen[row] >= (a->sliidx[row / a->sliceheight + 1] - a->sliidx[row / a->sliceheight]) / a->sliceheight) { \ 1452d1451d4SHong Zhang /* there is no extra room in row, therefore enlarge 1 slice column */ \ 14607e43b41SHong Zhang if (a->maxallocmat < a->sliidx[a->totalslices] + a->sliceheight) { \ 147d4002b98SHong Zhang /* allocates a larger array for the XSELL matrix types; only extend the current slice by one more column. */ \ 14807e43b41SHong Zhang PetscInt new_size = a->maxallocmat + a->sliceheight, *new_colidx; \ 149d4002b98SHong Zhang MatScalar *new_val; \ 15008401ef6SPierre Jolivet PetscCheck(a->nonew != -2, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "New nonzero at (%" PetscInt_FMT ",%" PetscInt_FMT ") caused a malloc\nUse MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE) to turn off this check", orow, ocol); \ 151d4002b98SHong Zhang /* malloc new storage space */ \ 1529566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(new_size, &new_val, new_size, &new_colidx)); \ 153d4002b98SHong Zhang /* copy over old data into new slots by two steps: one step for data before the current slice and the other for the rest */ \ 15407e43b41SHong Zhang PetscCall(PetscArraycpy(new_val, a->val, a->sliidx[row / a->sliceheight + 1])); \ 15507e43b41SHong Zhang PetscCall(PetscArraycpy(new_colidx, a->colidx, a->sliidx[row / a->sliceheight + 1])); \ 15607e43b41SHong Zhang PetscCall(PetscArraycpy(new_val + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->val + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 15707e43b41SHong Zhang PetscCall(PetscArraycpy(new_colidx + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->colidx + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 1582d1451d4SHong Zhang /* update pointers. Notice that they point to the FIRST postion of the row */ \ 15907e43b41SHong Zhang cp = new_colidx + a->sliidx[row / a->sliceheight] + (row % a->sliceheight); \ 16007e43b41SHong Zhang vp = new_val + a->sliidx[row / a->sliceheight] + (row % a->sliceheight); \ 161d4002b98SHong Zhang /* free up old matrix storage */ \ 1629566063dSJacob Faibussowitsch PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); \ 163d4002b98SHong Zhang a->val = (MatScalar *)new_val; \ 164d4002b98SHong Zhang a->colidx = new_colidx; \ 165d4002b98SHong Zhang a->singlemalloc = PETSC_TRUE; \ 166d4002b98SHong Zhang a->maxallocmat = new_size; \ 167d4002b98SHong Zhang a->reallocs++; \ 168d4002b98SHong Zhang } else { \ 169d4002b98SHong Zhang /* no need to reallocate, just shift the following slices to create space for the added slice column */ \ 17007e43b41SHong Zhang PetscCall(PetscArraymove(a->val + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->val + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 17107e43b41SHong Zhang PetscCall(PetscArraymove(a->colidx + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->colidx + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 172d4002b98SHong Zhang } \ 173d4002b98SHong Zhang /* update slice_idx */ \ 17407e43b41SHong Zhang for (ii = row / a->sliceheight + 1; ii <= a->totalslices; ii++) a->sliidx[ii] += a->sliceheight; \ 175d4002b98SHong Zhang if (a->rlen[row] >= a->maxallocrow) a->maxallocrow++; \ 176d4002b98SHong Zhang if (a->rlen[row] >= a->rlenmax) a->rlenmax++; \ 177d4002b98SHong Zhang } \ 178d4002b98SHong Zhang /* shift up all the later entries in this row */ \ 179d4002b98SHong Zhang for (ii = a->rlen[row] - 1; ii >= _i; ii--) { \ 18007e43b41SHong Zhang *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); \ 18107e43b41SHong Zhang *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); \ 182d4002b98SHong Zhang } \ 18307e43b41SHong Zhang *(cp + a->sliceheight * _i) = col; \ 18407e43b41SHong Zhang *(vp + a->sliceheight * _i) = value; \ 1859371c9d4SSatish Balay a->nz++; \ 1869371c9d4SSatish Balay a->rlen[row]++; \ 1879371c9d4SSatish Balay A->nonzerostate++; \ 1889371c9d4SSatish Balay low = _i + 1; \ 1899371c9d4SSatish Balay high++; \ 190d4002b98SHong Zhang } \ 191*a8f51744SPierre Jolivet } while (0) 192d4002b98SHong Zhang 193d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat, PetscInt, const PetscInt[]); 194d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMult_SeqSELL(Mat, Vec, Vec); 195d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultAdd_SeqSELL(Mat, Vec, Vec, Vec); 196d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultTranspose_SeqSELL(Mat, Vec, Vec); 197d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat, Vec, Vec, Vec); 198d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMissingDiagonal_SeqSELL(Mat, PetscBool *, PetscInt *); 199d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMarkDiagonal_SeqSELL(Mat); 200d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatInvertDiagonal_SeqSELL(Mat, PetscScalar, PetscScalar); 201d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatZeroEntries_SeqSELL(Mat); 202d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDestroy_SeqSELL(Mat); 203d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetOption_SeqSELL(Mat, MatOption, PetscBool); 204d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqSELL(Mat, Vec v); 205d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetValues_SeqSELL(Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[], PetscScalar[]); 206d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatView_SeqSELL(Mat, PetscViewer); 207d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SeqSELL(Mat, MatAssemblyType); 208d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetInfo_SeqSELL(Mat, MatInfoType, MatInfo *); 209d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetValues_SeqSELL(Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[], const PetscScalar[], InsertMode); 210d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatCopy_SeqSELL(Mat, Mat, MatStructure); 211d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetUp_SeqSELL(Mat); 212d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat, PetscScalar *[]); 213d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat, PetscScalar *[]); 214d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatShift_SeqSELL(Mat, PetscScalar); 215d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSOR_SeqSELL(Mat, Vec, PetscReal, MatSORType, PetscReal, PetscInt, PetscInt, Vec); 216d4002b98SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat); 217d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDuplicate_SeqSELL(Mat, MatDuplicateOption, Mat *); 218d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatEqual_SeqSELL(Mat, Mat, PetscBool *); 219d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat); 220d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat, MatType, MatReuse, Mat *); 221d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat, MatType, MatReuse, Mat *); 222d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatFDColoringCreate_SeqSELL(Mat, ISColoring, MatFDColoring); 223d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatFDColoringSetUp_SeqSELL(Mat, ISColoring, MatFDColoring); 224d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetColumnIJ_SeqSELL_Color(Mat, PetscInt, PetscBool, PetscBool, PetscInt *, const PetscInt *[], const PetscInt *[], PetscInt *[], PetscBool *); 225d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatRestoreColumnIJ_SeqSELL_Color(Mat, PetscInt, PetscBool, PetscBool, PetscInt *, const PetscInt *[], const PetscInt *[], PetscInt *[], PetscBool *); 226d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConjugate_SeqSELL(Mat A); 227d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatScale_SeqSELL(Mat, PetscScalar); 228d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDiagonalScale_SeqSELL(Mat, Vec, Vec); 229d4002b98SHong Zhang #endif 230