1a4963045SJacob Faibussowitsch #pragma once 2d4002b98SHong Zhang 3d4002b98SHong Zhang #include <petsc/private/matimpl.h> 4eec179cfSJacob Faibussowitsch #include <petsc/private/hashmapi.h> 5d4002b98SHong Zhang 6d4002b98SHong Zhang /* 74e58db63SHong Zhang For NVIDIA GPUs each slice should be padded to the boundary of 16 elements for best performance. 84e58db63SHong Zhang The optimal memory alignment in device memory is 128 bytes, 64 bytes, 32 bytes for double precision, single precision and half precision. 94e58db63SHong Zhang */ 104e58db63SHong Zhang #if defined(PETSC_HAVE_DEVICE) 114e58db63SHong Zhang #define DEVICE_MEM_ALIGN 16 124e58db63SHong Zhang #endif 134e58db63SHong Zhang 144e58db63SHong Zhang /* 15d4002b98SHong Zhang Struct header for SeqSELL matrix format 16d4002b98SHong Zhang */ 17d4002b98SHong Zhang #define SEQSELLHEADER(datatype) \ 18d4002b98SHong Zhang PetscBool roworiented; /* if true, row-oriented input, default */ \ 19d4002b98SHong Zhang PetscInt nonew; /* 1 don't add new nonzeros, -1 generate error on new */ \ 20d4002b98SHong Zhang PetscInt nounused; /* -1 generate error on unused space */ \ 21d4002b98SHong Zhang PetscBool singlemalloc; /* if true a, i, and j have been obtained with one big malloc */ \ 22d4002b98SHong Zhang PetscInt maxallocmat; /* max allocated space for the matrix */ \ 23d4002b98SHong Zhang PetscInt maxallocrow; /* max allocated space for each row */ \ 24d4002b98SHong Zhang PetscInt nz; /* actual nonzeros */ \ 25d4002b98SHong Zhang PetscInt rlenmax; /* max actual row length, rmax cannot exceed maxallocrow */ \ 26d4002b98SHong Zhang PetscInt *rlen; /* actual length of each row (padding zeros excluded) */ \ 27d4002b98SHong Zhang PetscBool free_rlen; /* free rlen array ? */ \ 28d4002b98SHong Zhang PetscInt reallocs; /* number of mallocs done during MatSetValues() \ 29d4002b98SHong Zhang as more values are set than were prealloced */ \ 30d4002b98SHong Zhang PetscBool keepnonzeropattern; /* keeps matrix structure same in calls to MatZeroRows()*/ \ 31d4002b98SHong Zhang PetscBool ignorezeroentries; \ 32d4002b98SHong Zhang PetscBool free_colidx; /* free the column indices colidx when the matrix is destroyed */ \ 33d4002b98SHong Zhang PetscBool free_val; /* free the numerical values when matrix is destroy */ \ 34d4002b98SHong Zhang PetscInt *colidx; /* column index */ \ 35d4002b98SHong Zhang PetscInt *diag; /* pointers to diagonal elements */ \ 36d4002b98SHong Zhang PetscInt nonzerorowcnt; /* how many rows have nonzero entries */ \ 37d4002b98SHong Zhang PetscBool free_diag; /* free diag ? */ \ 38d4002b98SHong Zhang datatype *val; /* elements including nonzeros and padding zeros */ \ 39d4002b98SHong Zhang PetscScalar *solve_work; /* work space used in MatSolve */ \ 40d4002b98SHong Zhang IS row, col, icol; /* index sets, used for reorderings */ \ 41d4002b98SHong Zhang PetscBool pivotinblocks; /* pivot inside factorization of each diagonal block */ \ 42d4002b98SHong Zhang Mat parent; /* set if this matrix was formed with MatDuplicate(...,MAT_SHARE_NONZERO_PATTERN,....); 43d4002b98SHong Zhang means that this shares some data structures with the parent including diag, ilen, imax, i, j */ \ 44d4002b98SHong Zhang PetscInt *sliidx; /* slice index */ \ 456108893eSStefano Zampini PetscInt totalslices; /* total number of slices */ \ 4607e43b41SHong Zhang PetscInt sliceheight; /* slice height */ \ 4707e43b41SHong Zhang PetscReal fillratio; /* ratio of number of padded zeros over total number of elements */ \ 4807e43b41SHong Zhang PetscReal avgslicewidth; /* average slice width */ \ 4907e43b41SHong Zhang PetscInt maxslicewidth; /* maximum slice width */ \ 50b921024eSHong Zhang PetscReal varslicesize; /* variance of slice size */ \ 5107e43b41SHong Zhang PetscInt *sliperm; /* slice permutation array, CUDA only */ \ 5207e43b41SHong Zhang PetscInt totalblocks; /* total number of blocks, CUDA only */ \ 5307e43b41SHong Zhang PetscInt *blockidx; /* block index, CUDA only */ \ 5407e43b41SHong Zhang PetscInt *block_row_map; /* starting row of the current block, CUDA only */ \ 5590d2215bSHong Zhang PetscInt chunksize; /* chunk size, CUDA only */ \ 5690d2215bSHong Zhang PetscInt totalchunks; /* total number of chunks, CUDA only */ \ 57baca6076SPierre Jolivet PetscInt *chunk_slice_map; /* starting slice of the current chunk, CUDA only */ \ 586108893eSStefano Zampini PetscInt *getrowcols; /* workarray for MatGetRow_SeqSELL */ \ 599371c9d4SSatish Balay PetscScalar *getrowvals /* workarray for MatGetRow_SeqSELL */ 60d4002b98SHong Zhang 61d4002b98SHong Zhang typedef struct { 62d4002b98SHong Zhang SEQSELLHEADER(MatScalar); 63d4002b98SHong Zhang MatScalar *saved_values; /* location for stashing nonzero values of matrix */ 64d4002b98SHong Zhang PetscScalar *idiag, *mdiag, *ssor_work; /* inverse of diagonal entries, diagonal values and workspace for Eisenstat trick */ 65d4002b98SHong Zhang PetscBool idiagvalid; /* current idiag[] and mdiag[] are valid */ 66d4002b98SHong Zhang PetscScalar fshift, omega; /* last used omega and fshift */ 67d4002b98SHong Zhang ISColoring coloring; /* set with MatADSetColoring() used by MatADSetValues() */ 68d4002b98SHong Zhang } Mat_SeqSELL; 69d4002b98SHong Zhang 70d4002b98SHong Zhang /* 71d4002b98SHong Zhang Frees the arrays from the XSELLPACK matrix type 72d4002b98SHong Zhang */ 73d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatSeqXSELLFreeSELL(Mat AA, MatScalar **val, PetscInt **colidx) 74d71ae5a4SJacob Faibussowitsch { 75d4002b98SHong Zhang Mat_SeqSELL *A = (Mat_SeqSELL *)AA->data; 76d4002b98SHong Zhang if (A->singlemalloc) { 779566063dSJacob Faibussowitsch PetscCall(PetscFree2(*val, *colidx)); 78d4002b98SHong Zhang } else { 799566063dSJacob Faibussowitsch if (A->free_val) PetscCall(PetscFree(*val)); 809566063dSJacob Faibussowitsch if (A->free_colidx) PetscCall(PetscFree(*colidx)); 81d4002b98SHong Zhang } 823ba16761SJacob Faibussowitsch return PETSC_SUCCESS; 83d4002b98SHong Zhang } 84d4002b98SHong Zhang 854e58db63SHong Zhang #define MatSeqXSELLReallocateSELL(Amat, AM, BS2, WIDTH, SIDX, SH, SID, ROW, COL, COLIDX, VAL, CP, VP, NONEW, datatype, MUL) \ 86a8f51744SPierre Jolivet do { \ 8707e43b41SHong Zhang if (WIDTH >= (SIDX[SID + 1] - SIDX[SID]) / SH) { \ 88d4002b98SHong Zhang Mat_SeqSELL *Ain = (Mat_SeqSELL *)Amat->data; \ 892d1451d4SHong Zhang /* there is no extra room in row, therefore enlarge 1 slice column */ \ 904e58db63SHong Zhang PetscInt new_size = Ain->maxallocmat + SH * MUL, *new_colidx; \ 91d4002b98SHong Zhang datatype *new_val; \ 92d4002b98SHong Zhang \ 9308401ef6SPierre Jolivet PetscCheck(NONEW != -2, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "New nonzero at (%" PetscInt_FMT ",%" PetscInt_FMT ") caused a malloc\nUse MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE) to turn off this check", ROW, COL); \ 94d4002b98SHong Zhang /* malloc new storage space */ \ 959566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(BS2 *new_size, &new_val, BS2 *new_size, &new_colidx)); \ 96d4002b98SHong Zhang \ 97d4002b98SHong Zhang /* copy over old data into new slots by two steps: one step for data before the current slice and the other for the rest */ \ 989566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(new_val, VAL, SIDX[SID + 1])); \ 999566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(new_colidx, COLIDX, SIDX[SID + 1])); \ 1004e58db63SHong Zhang PetscCall(PetscArraycpy(new_val + SIDX[SID + 1] + SH * MUL, VAL + SIDX[SID + 1], SIDX[Ain->totalslices] - SIDX[SID + 1])); \ 1014e58db63SHong Zhang PetscCall(PetscArraycpy(new_colidx + SIDX[SID + 1] + SH * MUL, COLIDX + SIDX[SID + 1], SIDX[Ain->totalslices] - SIDX[SID + 1])); \ 102d4002b98SHong Zhang /* update slice_idx */ \ 1034e58db63SHong Zhang for (ii = SID + 1; ii <= Ain->totalslices; ii++) { SIDX[ii] += SH * MUL; } \ 104baca6076SPierre Jolivet /* update pointers. Notice that they point to the FIRST position of the row */ \ 10507e43b41SHong Zhang CP = new_colidx + SIDX[SID] + (ROW % SH); \ 10607e43b41SHong Zhang VP = new_val + SIDX[SID] + (ROW % SH); \ 107d4002b98SHong Zhang /* free up old matrix storage */ \ 1089566063dSJacob Faibussowitsch PetscCall(MatSeqXSELLFreeSELL(A, &Ain->val, &Ain->colidx)); \ 109d4002b98SHong Zhang Ain->val = (MatScalar *)new_val; \ 110d4002b98SHong Zhang Ain->colidx = new_colidx; \ 111d4002b98SHong Zhang Ain->singlemalloc = PETSC_TRUE; \ 112d4002b98SHong Zhang Ain->maxallocmat = new_size; \ 113d4002b98SHong Zhang Ain->reallocs++; \ 1144e58db63SHong Zhang if (WIDTH >= Ain->maxallocrow) Ain->maxallocrow += MUL; \ 115d4002b98SHong Zhang if (WIDTH >= Ain->rlenmax) Ain->rlenmax++; \ 116a8f51744SPierre Jolivet } \ 117a8f51744SPierre Jolivet } while (0) 118d4002b98SHong Zhang 119d4002b98SHong Zhang #define MatSetValue_SeqSELL_Private(A, row, col, value, addv, orow, ocol, cp, vp, lastcol, low, high) \ 120a8f51744SPierre Jolivet do { \ 121d4002b98SHong Zhang Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; \ 122d4002b98SHong Zhang found = PETSC_FALSE; \ 123d4002b98SHong Zhang if (col <= lastcol) low = 0; \ 124d4002b98SHong Zhang else high = a->rlen[row]; \ 125d4002b98SHong Zhang lastcol = col; \ 126d4002b98SHong Zhang while (high - low > 5) { \ 127d4002b98SHong Zhang t = (low + high) / 2; \ 12807e43b41SHong Zhang if (*(cp + a->sliceheight * t) > col) high = t; \ 129d4002b98SHong Zhang else low = t; \ 130d4002b98SHong Zhang } \ 131d4002b98SHong Zhang for (_i = low; _i < high; _i++) { \ 13207e43b41SHong Zhang if (*(cp + a->sliceheight * _i) > col) break; \ 13307e43b41SHong Zhang if (*(cp + a->sliceheight * _i) == col) { \ 13407e43b41SHong Zhang if (addv == ADD_VALUES) *(vp + a->sliceheight * _i) += value; \ 13507e43b41SHong Zhang else *(vp + a->sliceheight * _i) = value; \ 136d4002b98SHong Zhang found = PETSC_TRUE; \ 137d4002b98SHong Zhang break; \ 138d4002b98SHong Zhang } \ 139d4002b98SHong Zhang } \ 140d4002b98SHong Zhang if (!found) { \ 14108401ef6SPierre Jolivet PetscCheck(a->nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero at global row/column (%" PetscInt_FMT ", %" PetscInt_FMT ") into matrix", orow, ocol); \ 14207e43b41SHong Zhang if (a->nonew != 1 && !(value == 0.0 && a->ignorezeroentries) && a->rlen[row] >= (a->sliidx[row / a->sliceheight + 1] - a->sliidx[row / a->sliceheight]) / a->sliceheight) { \ 1432d1451d4SHong Zhang /* there is no extra room in row, therefore enlarge 1 slice column */ \ 14407e43b41SHong Zhang if (a->maxallocmat < a->sliidx[a->totalslices] + a->sliceheight) { \ 145d4002b98SHong Zhang /* allocates a larger array for the XSELL matrix types; only extend the current slice by one more column. */ \ 14607e43b41SHong Zhang PetscInt new_size = a->maxallocmat + a->sliceheight, *new_colidx; \ 147d4002b98SHong Zhang MatScalar *new_val; \ 14808401ef6SPierre Jolivet PetscCheck(a->nonew != -2, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "New nonzero at (%" PetscInt_FMT ",%" PetscInt_FMT ") caused a malloc\nUse MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_FALSE) to turn off this check", orow, ocol); \ 149d4002b98SHong Zhang /* malloc new storage space */ \ 1509566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(new_size, &new_val, new_size, &new_colidx)); \ 151d4002b98SHong Zhang /* copy over old data into new slots by two steps: one step for data before the current slice and the other for the rest */ \ 15207e43b41SHong Zhang PetscCall(PetscArraycpy(new_val, a->val, a->sliidx[row / a->sliceheight + 1])); \ 15307e43b41SHong Zhang PetscCall(PetscArraycpy(new_colidx, a->colidx, a->sliidx[row / a->sliceheight + 1])); \ 154*8e3a54c0SPierre Jolivet PetscCall(PetscArraycpy(new_val + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, PetscSafePointerPlusOffset(a->val, a->sliidx[row / a->sliceheight + 1]), a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 155*8e3a54c0SPierre Jolivet PetscCall(PetscArraycpy(new_colidx + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, PetscSafePointerPlusOffset(a->colidx, a->sliidx[row / a->sliceheight + 1]), a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 156baca6076SPierre Jolivet /* update pointers. Notice that they point to the FIRST position of the row */ \ 15707e43b41SHong Zhang cp = new_colidx + a->sliidx[row / a->sliceheight] + (row % a->sliceheight); \ 15807e43b41SHong Zhang vp = new_val + a->sliidx[row / a->sliceheight] + (row % a->sliceheight); \ 159d4002b98SHong Zhang /* free up old matrix storage */ \ 1609566063dSJacob Faibussowitsch PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); \ 161d4002b98SHong Zhang a->val = (MatScalar *)new_val; \ 162d4002b98SHong Zhang a->colidx = new_colidx; \ 163d4002b98SHong Zhang a->singlemalloc = PETSC_TRUE; \ 164d4002b98SHong Zhang a->maxallocmat = new_size; \ 165d4002b98SHong Zhang a->reallocs++; \ 166d4002b98SHong Zhang } else { \ 167d4002b98SHong Zhang /* no need to reallocate, just shift the following slices to create space for the added slice column */ \ 16807e43b41SHong Zhang PetscCall(PetscArraymove(a->val + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->val + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 16907e43b41SHong Zhang PetscCall(PetscArraymove(a->colidx + a->sliidx[row / a->sliceheight + 1] + a->sliceheight, a->colidx + a->sliidx[row / a->sliceheight + 1], a->sliidx[a->totalslices] - a->sliidx[row / a->sliceheight + 1])); \ 170d4002b98SHong Zhang } \ 171d4002b98SHong Zhang /* update slice_idx */ \ 17207e43b41SHong Zhang for (ii = row / a->sliceheight + 1; ii <= a->totalslices; ii++) a->sliidx[ii] += a->sliceheight; \ 173d4002b98SHong Zhang if (a->rlen[row] >= a->maxallocrow) a->maxallocrow++; \ 174d4002b98SHong Zhang if (a->rlen[row] >= a->rlenmax) a->rlenmax++; \ 175d4002b98SHong Zhang } \ 176d4002b98SHong Zhang /* shift up all the later entries in this row */ \ 177d4002b98SHong Zhang for (ii = a->rlen[row] - 1; ii >= _i; ii--) { \ 17807e43b41SHong Zhang *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); \ 17907e43b41SHong Zhang *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); \ 180d4002b98SHong Zhang } \ 18107e43b41SHong Zhang *(cp + a->sliceheight * _i) = col; \ 18207e43b41SHong Zhang *(vp + a->sliceheight * _i) = value; \ 1839371c9d4SSatish Balay a->nz++; \ 1849371c9d4SSatish Balay a->rlen[row]++; \ 1859371c9d4SSatish Balay A->nonzerostate++; \ 1869371c9d4SSatish Balay low = _i + 1; \ 1879371c9d4SSatish Balay high++; \ 188d4002b98SHong Zhang } \ 189a8f51744SPierre Jolivet } while (0) 190d4002b98SHong Zhang 191d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat, PetscInt, const PetscInt[]); 192d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMult_SeqSELL(Mat, Vec, Vec); 193d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultAdd_SeqSELL(Mat, Vec, Vec, Vec); 194d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultTranspose_SeqSELL(Mat, Vec, Vec); 195d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat, Vec, Vec, Vec); 196d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMissingDiagonal_SeqSELL(Mat, PetscBool *, PetscInt *); 197d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatMarkDiagonal_SeqSELL(Mat); 198d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatInvertDiagonal_SeqSELL(Mat, PetscScalar, PetscScalar); 199d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatZeroEntries_SeqSELL(Mat); 200d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDestroy_SeqSELL(Mat); 201d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetOption_SeqSELL(Mat, MatOption, PetscBool); 202d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetDiagonal_SeqSELL(Mat, Vec v); 203d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetValues_SeqSELL(Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[], PetscScalar[]); 204d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatView_SeqSELL(Mat, PetscViewer); 205d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatAssemblyEnd_SeqSELL(Mat, MatAssemblyType); 206d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetInfo_SeqSELL(Mat, MatInfoType, MatInfo *); 207d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetValues_SeqSELL(Mat, PetscInt, const PetscInt[], PetscInt, const PetscInt[], const PetscScalar[], InsertMode); 208d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatCopy_SeqSELL(Mat, Mat, MatStructure); 209d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSetUp_SeqSELL(Mat); 210d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat, PetscScalar *[]); 211d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat, PetscScalar *[]); 212d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatShift_SeqSELL(Mat, PetscScalar); 213d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSOR_SeqSELL(Mat, Vec, PetscReal, MatSORType, PetscReal, PetscInt, PetscInt, Vec); 214d4002b98SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat); 215d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDuplicate_SeqSELL(Mat, MatDuplicateOption, Mat *); 216d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatEqual_SeqSELL(Mat, Mat, PetscBool *); 217d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat); 218d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat, MatType, MatReuse, Mat *); 219d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat, MatType, MatReuse, Mat *); 220d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatFDColoringCreate_SeqSELL(Mat, ISColoring, MatFDColoring); 221d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatFDColoringSetUp_SeqSELL(Mat, ISColoring, MatFDColoring); 222d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatGetColumnIJ_SeqSELL_Color(Mat, PetscInt, PetscBool, PetscBool, PetscInt *, const PetscInt *[], const PetscInt *[], PetscInt *[], PetscBool *); 223d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatRestoreColumnIJ_SeqSELL_Color(Mat, PetscInt, PetscBool, PetscBool, PetscInt *, const PetscInt *[], const PetscInt *[], PetscInt *[], PetscBool *); 224d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatConjugate_SeqSELL(Mat A); 225d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatScale_SeqSELL(Mat, PetscScalar); 226d4002b98SHong Zhang PETSC_INTERN PetscErrorCode MatDiagonalScale_SeqSELL(Mat, Vec, Vec); 227