1 /* 2 Defines the basic matrix operations for the SELL matrix storage format. 3 */ 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 #include <petscblaslapack.h> 6 #include <petsc/private/kernels/blocktranspose.h> 7 8 static PetscBool cited = PETSC_FALSE; 9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 10 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 11 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 12 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 13 " year = 2018\n" 14 "}\n"; 15 16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 17 18 #include <immintrin.h> 19 20 #if !defined(_MM_SCALE_8) 21 #define _MM_SCALE_8 8 22 #endif 23 24 #if defined(__AVX512F__) 25 /* these do not work 26 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 27 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 28 */ 29 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 30 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 31 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 32 vec_vals = _mm512_loadu_pd(aval); \ 33 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 34 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 35 #elif defined(__AVX2__) && defined(__FMA__) 36 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 37 vec_vals = _mm256_loadu_pd(aval); \ 38 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 39 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 40 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 41 #endif 42 #endif /* PETSC_HAVE_IMMINTRIN_H */ 43 44 /*@ 45 MatSeqSELLSetPreallocation - For good matrix assembly performance 46 the user should preallocate the matrix storage by setting the parameter `nz` 47 (or the array `nnz`). 48 49 Collective 50 51 Input Parameters: 52 + B - The `MATSEQSELL` matrix 53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 54 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 55 56 Level: intermediate 57 58 Notes: 59 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 60 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 61 allocation. 62 63 You can call `MatGetInfo()` to get information on how effective the preallocation was; 64 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 65 You can also run with the option `-info` and look for messages with the string 66 malloc in them to see if additional memory allocation was needed. 67 68 Developer Notes: 69 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 70 entries or columns indices. 71 72 The maximum number of nonzeos in any row should be as accurate as possible. 73 If it is underestimated, you will get bad performance due to reallocation 74 (`MatSeqXSELLReallocateSELL()`). 75 76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 77 @*/ 78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 79 { 80 PetscFunctionBegin; 81 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 82 PetscValidType(B, 1); 83 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 88 { 89 Mat_SeqSELL *b; 90 PetscInt i, j, totalslices; 91 #if defined(PETSC_HAVE_CUPM) 92 PetscInt rlenmax = 0; 93 #endif 94 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 95 96 PetscFunctionBegin; 97 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 98 if (maxallocrow == MAT_SKIP_ALLOCATION) { 99 skipallocation = PETSC_TRUE; 100 maxallocrow = 0; 101 } 102 103 PetscCall(PetscLayoutSetUp(B->rmap)); 104 PetscCall(PetscLayoutSetUp(B->cmap)); 105 106 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 107 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 108 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 109 if (rlen) { 110 for (i = 0; i < B->rmap->n; i++) { 111 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 112 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 113 } 114 } 115 116 B->preallocated = PETSC_TRUE; 117 118 b = (Mat_SeqSELL *)B->data; 119 120 if (!b->sliceheight) { /* not set yet */ 121 #if defined(PETSC_HAVE_CUPM) 122 b->sliceheight = 16; 123 #else 124 b->sliceheight = 8; 125 #endif 126 } 127 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 128 b->totalslices = totalslices; 129 if (!skipallocation) { 130 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 131 132 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 133 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 134 } 135 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 136 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 137 else if (maxallocrow < 0) maxallocrow = 1; 138 #if defined(PETSC_HAVE_CUPM) 139 rlenmax = maxallocrow; 140 /* Pad the slice to DEVICE_MEM_ALIGN */ 141 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 142 #endif 143 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 144 } else { 145 #if defined(PETSC_HAVE_CUPM) 146 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 147 #endif 148 maxallocrow = 0; 149 b->sliidx[0] = 0; 150 for (i = 1; i < totalslices; i++) { 151 b->sliidx[i] = 0; 152 for (j = 0; j < b->sliceheight; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); 153 #if defined(PETSC_HAVE_CUPM) 154 if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */ 155 rlenmax = PetscMax(b->sliidx[i], rlenmax); 156 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 157 } 158 #endif 159 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 160 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 161 } 162 /* last slice */ 163 b->sliidx[totalslices] = 0; 164 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 165 #if defined(PETSC_HAVE_CUPM) 166 if (mul != 0) { 167 rlenmax = PetscMax(b->sliidx[i], rlenmax); 168 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 169 } 170 #endif 171 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 172 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 173 } 174 175 /* allocate space for val, colidx, rlen */ 176 /* FIXME: should B's old memory be unlogged? */ 177 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 178 /* FIXME: assuming an element of the bit array takes 8 bits */ 179 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 180 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 181 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 182 183 b->singlemalloc = PETSC_TRUE; 184 b->free_val = PETSC_TRUE; 185 b->free_colidx = PETSC_TRUE; 186 } else { 187 b->free_val = PETSC_FALSE; 188 b->free_colidx = PETSC_FALSE; 189 } 190 191 b->nz = 0; 192 b->maxallocrow = maxallocrow; 193 #if defined(PETSC_HAVE_CUPM) 194 b->rlenmax = rlenmax; 195 #else 196 b->rlenmax = maxallocrow; 197 #endif 198 b->maxallocmat = b->sliidx[totalslices]; 199 B->info.nz_unneeded = (double)b->maxallocmat; 200 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 201 PetscFunctionReturn(PETSC_SUCCESS); 202 } 203 204 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 205 { 206 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 207 PetscInt shift; 208 209 PetscFunctionBegin; 210 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 211 if (nz) *nz = a->rlen[row]; 212 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 213 if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); 214 if (idx) { 215 PetscInt j; 216 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 217 *idx = a->getrowcols; 218 } 219 if (v) { 220 PetscInt j; 221 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 222 *v = a->getrowvals; 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 228 { 229 PetscFunctionBegin; 230 PetscFunctionReturn(PETSC_SUCCESS); 231 } 232 233 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 234 { 235 Mat B; 236 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 237 PetscInt i; 238 239 PetscFunctionBegin; 240 if (reuse == MAT_REUSE_MATRIX) { 241 B = *newmat; 242 PetscCall(MatZeroEntries(B)); 243 } else { 244 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 245 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 246 PetscCall(MatSetType(B, MATSEQAIJ)); 247 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 248 } 249 250 for (i = 0; i < A->rmap->n; i++) { 251 PetscInt nz = 0, *cols = NULL; 252 PetscScalar *vals = NULL; 253 254 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 255 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 256 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 257 } 258 259 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 260 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 261 B->rmap->bs = A->rmap->bs; 262 263 if (reuse == MAT_INPLACE_MATRIX) { 264 PetscCall(MatHeaderReplace(A, &B)); 265 } else { 266 *newmat = B; 267 } 268 PetscFunctionReturn(PETSC_SUCCESS); 269 } 270 271 #include <../src/mat/impls/aij/seq/aij.h> 272 273 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 274 { 275 Mat B; 276 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 277 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 278 const PetscInt *cols; 279 const PetscScalar *vals; 280 281 PetscFunctionBegin; 282 if (reuse == MAT_REUSE_MATRIX) { 283 B = *newmat; 284 } else { 285 if (PetscDefined(USE_DEBUG) || !a->ilen) { 286 PetscCall(PetscMalloc1(m, &rowlengths)); 287 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 288 } 289 if (PetscDefined(USE_DEBUG) && a->ilen) { 290 PetscBool eq; 291 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 292 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 293 PetscCall(PetscFree(rowlengths)); 294 rowlengths = a->ilen; 295 } else if (a->ilen) rowlengths = a->ilen; 296 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 297 PetscCall(MatSetSizes(B, m, n, m, n)); 298 PetscCall(MatSetType(B, MATSEQSELL)); 299 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 300 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 301 } 302 303 for (row = 0; row < m; row++) { 304 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 305 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 306 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 307 } 308 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 309 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 310 B->rmap->bs = A->rmap->bs; 311 312 if (reuse == MAT_INPLACE_MATRIX) { 313 PetscCall(MatHeaderReplace(A, &B)); 314 } else { 315 *newmat = B; 316 } 317 PetscFunctionReturn(PETSC_SUCCESS); 318 } 319 320 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 321 { 322 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 323 PetscScalar *y; 324 const PetscScalar *x; 325 const MatScalar *aval = a->val; 326 PetscInt totalslices = a->totalslices; 327 const PetscInt *acolidx = a->colidx; 328 PetscInt i, j; 329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 330 __m512d vec_x, vec_y, vec_vals; 331 __m256i vec_idx; 332 __mmask8 mask; 333 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 334 __m256i vec_idx2, vec_idx3, vec_idx4; 335 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 336 __m128i vec_idx; 337 __m256d vec_x, vec_y, vec_y2, vec_vals; 338 MatScalar yval; 339 PetscInt r, rows_left, row, nnz_in_row; 340 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 341 __m128d vec_x_tmp; 342 __m256d vec_x, vec_y, vec_y2, vec_vals; 343 MatScalar yval; 344 PetscInt r, rows_left, row, nnz_in_row; 345 #else 346 PetscInt k, sliceheight = a->sliceheight; 347 PetscScalar *sum; 348 #endif 349 350 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 351 #pragma disjoint(*x, *y, *aval) 352 #endif 353 354 PetscFunctionBegin; 355 PetscCall(VecGetArrayRead(xx, &x)); 356 PetscCall(VecGetArray(yy, &y)); 357 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 358 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 359 for (i = 0; i < totalslices; i++) { /* loop over slices */ 360 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 361 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 362 363 vec_y = _mm512_setzero_pd(); 364 vec_y2 = _mm512_setzero_pd(); 365 vec_y3 = _mm512_setzero_pd(); 366 vec_y4 = _mm512_setzero_pd(); 367 368 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 369 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 370 case 3: 371 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 372 acolidx += 8; 373 aval += 8; 374 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 375 acolidx += 8; 376 aval += 8; 377 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 378 acolidx += 8; 379 aval += 8; 380 j += 3; 381 break; 382 case 2: 383 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 384 acolidx += 8; 385 aval += 8; 386 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 387 acolidx += 8; 388 aval += 8; 389 j += 2; 390 break; 391 case 1: 392 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 393 acolidx += 8; 394 aval += 8; 395 j += 1; 396 break; 397 } 398 #pragma novector 399 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 400 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 401 acolidx += 8; 402 aval += 8; 403 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 404 acolidx += 8; 405 aval += 8; 406 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 407 acolidx += 8; 408 aval += 8; 409 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 410 acolidx += 8; 411 aval += 8; 412 } 413 414 vec_y = _mm512_add_pd(vec_y, vec_y2); 415 vec_y = _mm512_add_pd(vec_y, vec_y3); 416 vec_y = _mm512_add_pd(vec_y, vec_y4); 417 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 418 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 419 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 420 } else { 421 _mm512_storeu_pd(&y[8 * i], vec_y); 422 } 423 } 424 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 425 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 426 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 427 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 428 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 429 430 /* last slice may have padding rows. Don't use vectorization. */ 431 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 432 rows_left = A->rmap->n - 8 * i; 433 for (r = 0; r < rows_left; ++r) { 434 yval = (MatScalar)0; 435 row = 8 * i + r; 436 nnz_in_row = a->rlen[row]; 437 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 438 y[row] = yval; 439 } 440 break; 441 } 442 443 vec_y = _mm256_setzero_pd(); 444 vec_y2 = _mm256_setzero_pd(); 445 446 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 447 #pragma novector 448 #pragma unroll(2) 449 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 450 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 451 aval += 4; 452 acolidx += 4; 453 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 454 aval += 4; 455 acolidx += 4; 456 } 457 458 _mm256_storeu_pd(y + i * 8, vec_y); 459 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 460 } 461 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 462 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 463 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 464 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 465 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 466 467 vec_y = _mm256_setzero_pd(); 468 vec_y2 = _mm256_setzero_pd(); 469 470 /* last slice may have padding rows. Don't use vectorization. */ 471 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 472 rows_left = A->rmap->n - 8 * i; 473 for (r = 0; r < rows_left; ++r) { 474 yval = (MatScalar)0; 475 row = 8 * i + r; 476 nnz_in_row = a->rlen[row]; 477 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 478 y[row] = yval; 479 } 480 break; 481 } 482 483 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 484 #pragma novector 485 #pragma unroll(2) 486 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 487 vec_vals = _mm256_loadu_pd(aval); 488 vec_x_tmp = _mm_setzero_pd(); 489 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 490 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 491 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 492 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 493 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 494 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 495 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 496 aval += 4; 497 498 vec_vals = _mm256_loadu_pd(aval); 499 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 500 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 501 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 502 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 503 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 504 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 505 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 506 aval += 4; 507 } 508 509 _mm256_storeu_pd(y + i * 8, vec_y); 510 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 511 } 512 #else 513 PetscCall(PetscMalloc1(sliceheight, &sum)); 514 for (i = 0; i < totalslices; i++) { /* loop over slices */ 515 for (j = 0; j < sliceheight; j++) { 516 sum[j] = 0.0; 517 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 518 } 519 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 520 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 521 } else { 522 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 523 } 524 } 525 PetscCall(PetscFree(sum)); 526 #endif 527 528 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 529 PetscCall(VecRestoreArrayRead(xx, &x)); 530 PetscCall(VecRestoreArray(yy, &y)); 531 PetscFunctionReturn(PETSC_SUCCESS); 532 } 533 534 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 535 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 536 { 537 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 538 PetscScalar *y, *z; 539 const PetscScalar *x; 540 const MatScalar *aval = a->val; 541 PetscInt totalslices = a->totalslices; 542 const PetscInt *acolidx = a->colidx; 543 PetscInt i, j; 544 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 545 __m512d vec_x, vec_y, vec_vals; 546 __m256i vec_idx; 547 __mmask8 mask = 0; 548 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 549 __m256i vec_idx2, vec_idx3, vec_idx4; 550 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 551 __m128d vec_x_tmp; 552 __m256d vec_x, vec_y, vec_y2, vec_vals; 553 MatScalar yval; 554 PetscInt r, row, nnz_in_row; 555 #else 556 PetscInt k, sliceheight = a->sliceheight; 557 PetscScalar *sum; 558 #endif 559 560 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 561 #pragma disjoint(*x, *y, *aval) 562 #endif 563 564 PetscFunctionBegin; 565 if (!a->nz) { 566 PetscCall(VecCopy(yy, zz)); 567 PetscFunctionReturn(PETSC_SUCCESS); 568 } 569 PetscCall(VecGetArrayRead(xx, &x)); 570 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 571 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 572 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 573 for (i = 0; i < totalslices; i++) { /* loop over slices */ 574 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 575 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 576 577 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 578 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 579 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 580 } else { 581 vec_y = _mm512_loadu_pd(&y[8 * i]); 582 } 583 vec_y2 = _mm512_setzero_pd(); 584 vec_y3 = _mm512_setzero_pd(); 585 vec_y4 = _mm512_setzero_pd(); 586 587 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 588 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 589 case 3: 590 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 591 acolidx += 8; 592 aval += 8; 593 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 594 acolidx += 8; 595 aval += 8; 596 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 597 acolidx += 8; 598 aval += 8; 599 j += 3; 600 break; 601 case 2: 602 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 603 acolidx += 8; 604 aval += 8; 605 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 606 acolidx += 8; 607 aval += 8; 608 j += 2; 609 break; 610 case 1: 611 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 612 acolidx += 8; 613 aval += 8; 614 j += 1; 615 break; 616 } 617 #pragma novector 618 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 619 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 620 acolidx += 8; 621 aval += 8; 622 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 623 acolidx += 8; 624 aval += 8; 625 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 626 acolidx += 8; 627 aval += 8; 628 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 629 acolidx += 8; 630 aval += 8; 631 } 632 633 vec_y = _mm512_add_pd(vec_y, vec_y2); 634 vec_y = _mm512_add_pd(vec_y, vec_y3); 635 vec_y = _mm512_add_pd(vec_y, vec_y4); 636 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 637 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 638 } else { 639 _mm512_storeu_pd(&z[8 * i], vec_y); 640 } 641 } 642 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 643 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 644 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 645 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 646 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 647 648 /* last slice may have padding rows. Don't use vectorization. */ 649 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 650 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 651 row = 8 * i + r; 652 yval = (MatScalar)0.0; 653 nnz_in_row = a->rlen[row]; 654 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 655 z[row] = y[row] + yval; 656 } 657 break; 658 } 659 660 vec_y = _mm256_loadu_pd(y + 8 * i); 661 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 662 663 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 664 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 665 vec_vals = _mm256_loadu_pd(aval); 666 vec_x_tmp = _mm_setzero_pd(); 667 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 668 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 669 vec_x = _mm256_setzero_pd(); 670 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 671 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 672 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 673 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 674 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 675 aval += 4; 676 677 vec_vals = _mm256_loadu_pd(aval); 678 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 679 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 680 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 681 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 682 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 683 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 684 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 685 aval += 4; 686 } 687 688 _mm256_storeu_pd(z + i * 8, vec_y); 689 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 690 } 691 #else 692 PetscCall(PetscMalloc1(sliceheight, &sum)); 693 for (i = 0; i < totalslices; i++) { /* loop over slices */ 694 for (j = 0; j < sliceheight; j++) { 695 sum[j] = 0.0; 696 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 697 } 698 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 699 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 700 } else { 701 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 702 } 703 } 704 PetscCall(PetscFree(sum)); 705 #endif 706 707 PetscCall(PetscLogFlops(2.0 * a->nz)); 708 PetscCall(VecRestoreArrayRead(xx, &x)); 709 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 710 PetscFunctionReturn(PETSC_SUCCESS); 711 } 712 713 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 714 { 715 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 716 PetscScalar *y; 717 const PetscScalar *x; 718 const MatScalar *aval = a->val; 719 const PetscInt *acolidx = a->colidx; 720 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 721 722 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 723 #pragma disjoint(*x, *y, *aval) 724 #endif 725 726 PetscFunctionBegin; 727 if (A->symmetric == PETSC_BOOL3_TRUE) { 728 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 729 PetscFunctionReturn(PETSC_SUCCESS); 730 } 731 if (zz != yy) PetscCall(VecCopy(zz, yy)); 732 733 if (a->nz) { 734 PetscCall(VecGetArrayRead(xx, &x)); 735 PetscCall(VecGetArray(yy, &y)); 736 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 737 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 738 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 739 row = sliceheight * i + r; 740 nnz_in_row = a->rlen[row]; 741 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 742 } 743 break; 744 } 745 for (r = 0; r < sliceheight; ++r) 746 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 747 } 748 PetscCall(PetscLogFlops(2.0 * a->nz)); 749 PetscCall(VecRestoreArrayRead(xx, &x)); 750 PetscCall(VecRestoreArray(yy, &y)); 751 } 752 PetscFunctionReturn(PETSC_SUCCESS); 753 } 754 755 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 756 { 757 PetscFunctionBegin; 758 if (A->symmetric == PETSC_BOOL3_TRUE) { 759 PetscCall(MatMult_SeqSELL(A, xx, yy)); 760 } else { 761 PetscCall(VecSet(yy, 0.0)); 762 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 763 } 764 PetscFunctionReturn(PETSC_SUCCESS); 765 } 766 767 /* 768 Checks for missing diagonals 769 */ 770 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 771 { 772 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 773 PetscInt *diag, i; 774 775 PetscFunctionBegin; 776 *missing = PETSC_FALSE; 777 if (A->rmap->n > 0 && !a->colidx) { 778 *missing = PETSC_TRUE; 779 if (d) *d = 0; 780 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 781 } else { 782 diag = a->diag; 783 for (i = 0; i < A->rmap->n; i++) { 784 if (diag[i] == -1) { 785 *missing = PETSC_TRUE; 786 if (d) *d = i; 787 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 788 break; 789 } 790 } 791 } 792 PetscFunctionReturn(PETSC_SUCCESS); 793 } 794 795 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 796 { 797 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 798 PetscInt i, j, m = A->rmap->n, shift; 799 800 PetscFunctionBegin; 801 if (!a->diag) { 802 PetscCall(PetscMalloc1(m, &a->diag)); 803 a->free_diag = PETSC_TRUE; 804 } 805 for (i = 0; i < m; i++) { /* loop over rows */ 806 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 807 a->diag[i] = -1; 808 for (j = 0; j < a->rlen[i]; j++) { 809 if (a->colidx[shift + a->sliceheight * j] == i) { 810 a->diag[i] = shift + a->sliceheight * j; 811 break; 812 } 813 } 814 } 815 PetscFunctionReturn(PETSC_SUCCESS); 816 } 817 818 /* 819 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 820 */ 821 PetscErrorCode MatInvertDiagonal_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 822 { 823 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 824 PetscInt i, *diag, m = A->rmap->n; 825 MatScalar *val = a->val; 826 PetscScalar *idiag, *mdiag; 827 828 PetscFunctionBegin; 829 if (a->idiagvalid) PetscFunctionReturn(PETSC_SUCCESS); 830 PetscCall(MatMarkDiagonal_SeqSELL(A)); 831 diag = a->diag; 832 if (!a->idiag) { 833 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 834 val = a->val; 835 } 836 mdiag = a->mdiag; 837 idiag = a->idiag; 838 839 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 840 for (i = 0; i < m; i++) { 841 mdiag[i] = val[diag[i]]; 842 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 843 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 844 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 845 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 846 A->factorerror_zeropivot_value = 0.0; 847 A->factorerror_zeropivot_row = i; 848 } 849 idiag[i] = 1.0 / val[diag[i]]; 850 } 851 PetscCall(PetscLogFlops(m)); 852 } else { 853 for (i = 0; i < m; i++) { 854 mdiag[i] = val[diag[i]]; 855 idiag[i] = omega / (fshift + val[diag[i]]); 856 } 857 PetscCall(PetscLogFlops(2.0 * m)); 858 } 859 a->idiagvalid = PETSC_TRUE; 860 PetscFunctionReturn(PETSC_SUCCESS); 861 } 862 863 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 864 { 865 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 866 867 PetscFunctionBegin; 868 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 869 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 870 PetscFunctionReturn(PETSC_SUCCESS); 871 } 872 873 PetscErrorCode MatDestroy_SeqSELL(Mat A) 874 { 875 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 876 877 PetscFunctionBegin; 878 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 879 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 880 PetscCall(ISDestroy(&a->row)); 881 PetscCall(ISDestroy(&a->col)); 882 PetscCall(PetscFree(a->diag)); 883 PetscCall(PetscFree(a->rlen)); 884 PetscCall(PetscFree(a->sliidx)); 885 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 886 PetscCall(PetscFree(a->solve_work)); 887 PetscCall(ISDestroy(&a->icol)); 888 PetscCall(PetscFree(a->saved_values)); 889 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 890 PetscCall(PetscFree(A->data)); 891 #if defined(PETSC_HAVE_CUPM) 892 PetscCall(PetscFree(a->chunk_slice_map)); 893 #endif 894 895 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 896 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 897 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 898 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 899 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 900 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 901 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 902 #if defined(PETSC_HAVE_CUDA) 903 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL)); 904 #endif 905 #if defined(PETSC_HAVE_HIP) 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL)); 907 #endif 908 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 909 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 910 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 911 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL)); 912 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 913 PetscFunctionReturn(PETSC_SUCCESS); 914 } 915 916 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 917 { 918 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 919 920 PetscFunctionBegin; 921 switch (op) { 922 case MAT_ROW_ORIENTED: 923 a->roworiented = flg; 924 break; 925 case MAT_KEEP_NONZERO_PATTERN: 926 a->keepnonzeropattern = flg; 927 break; 928 case MAT_NEW_NONZERO_LOCATIONS: 929 a->nonew = (flg ? 0 : 1); 930 break; 931 case MAT_NEW_NONZERO_LOCATION_ERR: 932 a->nonew = (flg ? -1 : 0); 933 break; 934 case MAT_NEW_NONZERO_ALLOCATION_ERR: 935 a->nonew = (flg ? -2 : 0); 936 break; 937 case MAT_UNUSED_NONZERO_LOCATION_ERR: 938 a->nounused = (flg ? -1 : 0); 939 break; 940 default: 941 break; 942 } 943 PetscFunctionReturn(PETSC_SUCCESS); 944 } 945 946 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 947 { 948 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 949 PetscInt i, j, n, shift; 950 PetscScalar *x, zero = 0.0; 951 952 PetscFunctionBegin; 953 PetscCall(VecGetLocalSize(v, &n)); 954 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 955 956 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 957 PetscInt *diag = a->diag; 958 PetscCall(VecGetArray(v, &x)); 959 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 960 PetscCall(VecRestoreArray(v, &x)); 961 PetscFunctionReturn(PETSC_SUCCESS); 962 } 963 964 PetscCall(VecSet(v, zero)); 965 PetscCall(VecGetArray(v, &x)); 966 for (i = 0; i < n; i++) { /* loop over rows */ 967 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 968 x[i] = 0; 969 for (j = 0; j < a->rlen[i]; j++) { 970 if (a->colidx[shift + a->sliceheight * j] == i) { 971 x[i] = a->val[shift + a->sliceheight * j]; 972 break; 973 } 974 } 975 } 976 PetscCall(VecRestoreArray(v, &x)); 977 PetscFunctionReturn(PETSC_SUCCESS); 978 } 979 980 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 981 { 982 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 983 const PetscScalar *l, *r; 984 PetscInt i, j, m, n, row; 985 986 PetscFunctionBegin; 987 if (ll) { 988 /* The local size is used so that VecMPI can be passed to this routine 989 by MatDiagonalScale_MPISELL */ 990 PetscCall(VecGetLocalSize(ll, &m)); 991 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 992 PetscCall(VecGetArrayRead(ll, &l)); 993 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 994 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 995 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 996 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 997 } 998 } else { 999 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) a->val[j] *= l[a->sliceheight * i + row]; 1000 } 1001 } 1002 PetscCall(VecRestoreArrayRead(ll, &l)); 1003 PetscCall(PetscLogFlops(a->nz)); 1004 } 1005 if (rr) { 1006 PetscCall(VecGetLocalSize(rr, &n)); 1007 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1008 PetscCall(VecGetArrayRead(rr, &r)); 1009 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1010 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1011 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1012 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1013 } 1014 } else { 1015 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1016 } 1017 } 1018 PetscCall(VecRestoreArrayRead(rr, &r)); 1019 PetscCall(PetscLogFlops(a->nz)); 1020 } 1021 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1022 #if defined(PETSC_HAVE_CUPM) 1023 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1024 #endif 1025 PetscFunctionReturn(PETSC_SUCCESS); 1026 } 1027 1028 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1029 { 1030 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1031 PetscInt *cp, i, k, low, high, t, row, col, l; 1032 PetscInt shift; 1033 MatScalar *vp; 1034 1035 PetscFunctionBegin; 1036 for (k = 0; k < m; k++) { /* loop over requested rows */ 1037 row = im[k]; 1038 if (row < 0) continue; 1039 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1040 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1041 cp = a->colidx + shift; /* pointer to the row */ 1042 vp = a->val + shift; /* pointer to the row */ 1043 for (l = 0; l < n; l++) { /* loop over requested columns */ 1044 col = in[l]; 1045 if (col < 0) continue; 1046 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1047 high = a->rlen[row]; 1048 low = 0; /* assume unsorted */ 1049 while (high - low > 5) { 1050 t = (low + high) / 2; 1051 if (*(cp + a->sliceheight * t) > col) high = t; 1052 else low = t; 1053 } 1054 for (i = low; i < high; i++) { 1055 if (*(cp + a->sliceheight * i) > col) break; 1056 if (*(cp + a->sliceheight * i) == col) { 1057 *v++ = *(vp + a->sliceheight * i); 1058 goto finished; 1059 } 1060 } 1061 *v++ = 0.0; 1062 finished:; 1063 } 1064 } 1065 PetscFunctionReturn(PETSC_SUCCESS); 1066 } 1067 1068 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1069 { 1070 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1071 PetscInt i, j, m = A->rmap->n, shift; 1072 const char *name; 1073 PetscViewerFormat format; 1074 1075 PetscFunctionBegin; 1076 PetscCall(PetscViewerGetFormat(viewer, &format)); 1077 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1078 PetscInt nofinalvalue = 0; 1079 /* 1080 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) nofinalvalue = 1; 1081 */ 1082 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1083 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1084 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1085 #if defined(PETSC_USE_COMPLEX) 1086 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1087 #else 1088 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1089 #endif 1090 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1091 1092 for (i = 0; i < m; i++) { 1093 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1094 for (j = 0; j < a->rlen[i]; j++) { 1095 #if defined(PETSC_USE_COMPLEX) 1096 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1097 #else 1098 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1099 #endif 1100 } 1101 } 1102 /* 1103 if (nofinalvalue) { 1104 #if defined(PETSC_USE_COMPLEX) 1105 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1106 #else 1107 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1108 #endif 1109 } 1110 */ 1111 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1112 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1113 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1114 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1115 PetscFunctionReturn(PETSC_SUCCESS); 1116 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1117 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1118 for (i = 0; i < m; i++) { 1119 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1120 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1121 for (j = 0; j < a->rlen[i]; j++) { 1122 #if defined(PETSC_USE_COMPLEX) 1123 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1124 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1125 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1126 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1127 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1128 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1129 } 1130 #else 1131 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1132 #endif 1133 } 1134 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1135 } 1136 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1137 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1138 PetscInt cnt = 0, jcnt; 1139 PetscScalar value; 1140 #if defined(PETSC_USE_COMPLEX) 1141 PetscBool realonly = PETSC_TRUE; 1142 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1143 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1144 realonly = PETSC_FALSE; 1145 break; 1146 } 1147 } 1148 #endif 1149 1150 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1151 for (i = 0; i < m; i++) { 1152 jcnt = 0; 1153 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1154 for (j = 0; j < A->cmap->n; j++) { 1155 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1156 value = a->val[cnt++]; 1157 jcnt++; 1158 } else { 1159 value = 0.0; 1160 } 1161 #if defined(PETSC_USE_COMPLEX) 1162 if (realonly) { 1163 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1164 } else { 1165 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1166 } 1167 #else 1168 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1169 #endif 1170 } 1171 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1172 } 1173 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1174 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1175 PetscInt fshift = 1; 1176 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1177 #if defined(PETSC_USE_COMPLEX) 1178 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1179 #else 1180 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1181 #endif 1182 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1183 for (i = 0; i < m; i++) { 1184 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1185 for (j = 0; j < a->rlen[i]; j++) { 1186 #if defined(PETSC_USE_COMPLEX) 1187 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1188 #else 1189 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1190 #endif 1191 } 1192 } 1193 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1194 } else if (format == PETSC_VIEWER_NATIVE) { 1195 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1196 PetscInt row; 1197 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1198 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1199 #if defined(PETSC_USE_COMPLEX) 1200 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1201 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1202 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1203 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1204 } else { 1205 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1206 } 1207 #else 1208 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1209 #endif 1210 } 1211 } 1212 } else { 1213 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1214 if (A->factortype) { 1215 for (i = 0; i < m; i++) { 1216 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1217 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1218 /* L part */ 1219 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1220 #if defined(PETSC_USE_COMPLEX) 1221 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1222 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1223 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1224 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1225 } else { 1226 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1227 } 1228 #else 1229 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1230 #endif 1231 } 1232 /* diagonal */ 1233 j = a->diag[i]; 1234 #if defined(PETSC_USE_COMPLEX) 1235 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1236 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1237 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1238 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1239 } else { 1240 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1241 } 1242 #else 1243 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j]))); 1244 #endif 1245 1246 /* U part */ 1247 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1248 #if defined(PETSC_USE_COMPLEX) 1249 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1250 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1251 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1252 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1253 } else { 1254 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1255 } 1256 #else 1257 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1258 #endif 1259 } 1260 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1261 } 1262 } else { 1263 for (i = 0; i < m; i++) { 1264 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1265 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1266 for (j = 0; j < a->rlen[i]; j++) { 1267 #if defined(PETSC_USE_COMPLEX) 1268 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1269 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1270 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1271 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1272 } else { 1273 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1274 } 1275 #else 1276 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1277 #endif 1278 } 1279 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1280 } 1281 } 1282 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1283 } 1284 PetscCall(PetscViewerFlush(viewer)); 1285 PetscFunctionReturn(PETSC_SUCCESS); 1286 } 1287 1288 #include <petscdraw.h> 1289 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1290 { 1291 Mat A = (Mat)Aa; 1292 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1293 PetscInt i, j, m = A->rmap->n, shift; 1294 int color; 1295 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1296 PetscViewer viewer; 1297 PetscViewerFormat format; 1298 1299 PetscFunctionBegin; 1300 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1301 PetscCall(PetscViewerGetFormat(viewer, &format)); 1302 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1303 1304 /* loop over matrix elements drawing boxes */ 1305 1306 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1307 PetscDrawCollectiveBegin(draw); 1308 /* Blue for negative, Cyan for zero and Red for positive */ 1309 color = PETSC_DRAW_BLUE; 1310 for (i = 0; i < m; i++) { 1311 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1312 y_l = m - i - 1.0; 1313 y_r = y_l + 1.0; 1314 for (j = 0; j < a->rlen[i]; j++) { 1315 x_l = a->colidx[shift + a->sliceheight * j]; 1316 x_r = x_l + 1.0; 1317 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1318 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1319 } 1320 } 1321 color = PETSC_DRAW_CYAN; 1322 for (i = 0; i < m; i++) { 1323 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1324 y_l = m - i - 1.0; 1325 y_r = y_l + 1.0; 1326 for (j = 0; j < a->rlen[i]; j++) { 1327 x_l = a->colidx[shift + a->sliceheight * j]; 1328 x_r = x_l + 1.0; 1329 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1330 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1331 } 1332 } 1333 color = PETSC_DRAW_RED; 1334 for (i = 0; i < m; i++) { 1335 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1336 y_l = m - i - 1.0; 1337 y_r = y_l + 1.0; 1338 for (j = 0; j < a->rlen[i]; j++) { 1339 x_l = a->colidx[shift + a->sliceheight * j]; 1340 x_r = x_l + 1.0; 1341 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1342 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1343 } 1344 } 1345 PetscDrawCollectiveEnd(draw); 1346 } else { 1347 /* use contour shading to indicate magnitude of values */ 1348 /* first determine max of all nonzero values */ 1349 PetscReal minv = 0.0, maxv = 0.0; 1350 PetscInt count = 0; 1351 PetscDraw popup; 1352 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1353 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1354 } 1355 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1356 PetscCall(PetscDrawGetPopup(draw, &popup)); 1357 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1358 1359 PetscDrawCollectiveBegin(draw); 1360 for (i = 0; i < m; i++) { 1361 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1362 y_l = m - i - 1.0; 1363 y_r = y_l + 1.0; 1364 for (j = 0; j < a->rlen[i]; j++) { 1365 x_l = a->colidx[shift + a->sliceheight * j]; 1366 x_r = x_l + 1.0; 1367 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1368 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1369 count++; 1370 } 1371 } 1372 PetscDrawCollectiveEnd(draw); 1373 } 1374 PetscFunctionReturn(PETSC_SUCCESS); 1375 } 1376 1377 #include <petscdraw.h> 1378 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1379 { 1380 PetscDraw draw; 1381 PetscReal xr, yr, xl, yl, h, w; 1382 PetscBool isnull; 1383 1384 PetscFunctionBegin; 1385 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1386 PetscCall(PetscDrawIsNull(draw, &isnull)); 1387 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1388 1389 xr = A->cmap->n; 1390 yr = A->rmap->n; 1391 h = yr / 10.0; 1392 w = xr / 10.0; 1393 xr += w; 1394 yr += h; 1395 xl = -w; 1396 yl = -h; 1397 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1398 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1399 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1400 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1401 PetscCall(PetscDrawSave(draw)); 1402 PetscFunctionReturn(PETSC_SUCCESS); 1403 } 1404 1405 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1406 { 1407 PetscBool isascii, isbinary, isdraw; 1408 1409 PetscFunctionBegin; 1410 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii)); 1411 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1412 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1413 if (isascii) { 1414 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1415 } else if (isbinary) { 1416 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1417 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1418 PetscFunctionReturn(PETSC_SUCCESS); 1419 } 1420 1421 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1422 { 1423 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1424 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1425 MatScalar *vp; 1426 #if defined(PETSC_HAVE_CUPM) 1427 PetscInt totalchunks = 0; 1428 #endif 1429 1430 PetscFunctionBegin; 1431 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1432 /* To do: compress out the unused elements */ 1433 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1434 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1435 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1436 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1437 a->nonzerorowcnt = 0; 1438 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1439 for (i = 0; i < a->totalslices; ++i) { 1440 shift = a->sliidx[i]; /* starting index of the slice */ 1441 cp = PetscSafePointerPlusOffset(a->colidx, shift); /* pointer to the column indices of the slice */ 1442 vp = PetscSafePointerPlusOffset(a->val, shift); /* pointer to the nonzero values of the slice */ 1443 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1444 row = a->sliceheight * i + row_in_slice; 1445 nrow = a->rlen[row]; /* number of nonzeros in row */ 1446 /* 1447 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1448 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1449 */ 1450 lastcol = 0; 1451 if (nrow > 0) { /* nonempty row */ 1452 a->nonzerorowcnt++; 1453 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1454 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1455 for (j = 1; j < a->sliceheight; j++) { 1456 if (a->rlen[a->sliceheight * i + j]) { 1457 lastcol = cp[j]; 1458 break; 1459 } 1460 } 1461 } else { 1462 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1463 } 1464 1465 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1466 cp[a->sliceheight * k + row_in_slice] = lastcol; 1467 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1468 } 1469 } 1470 } 1471 1472 A->info.mallocs += a->reallocs; 1473 a->reallocs = 0; 1474 1475 PetscCall(MatSeqSELLInvalidateDiagonal(A)); 1476 #if defined(PETSC_HAVE_CUPM) 1477 if (!a->chunksize && a->totalslices) { 1478 a->chunksize = 64; 1479 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1480 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1481 } 1482 if (totalchunks != a->totalchunks) { 1483 PetscCall(PetscFree(a->chunk_slice_map)); 1484 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1485 a->totalchunks = totalchunks; 1486 } 1487 j = 0; 1488 for (i = 0; i < totalchunks; i++) { 1489 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1490 a->chunk_slice_map[i] = j; 1491 } 1492 #endif 1493 PetscFunctionReturn(PETSC_SUCCESS); 1494 } 1495 1496 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1497 { 1498 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1499 1500 PetscFunctionBegin; 1501 info->block_size = 1.0; 1502 info->nz_allocated = a->maxallocmat; 1503 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1504 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1505 info->assemblies = A->num_ass; 1506 info->mallocs = A->info.mallocs; 1507 info->memory = 0; /* REVIEW ME */ 1508 if (A->factortype) { 1509 info->fill_ratio_given = A->info.fill_ratio_given; 1510 info->fill_ratio_needed = A->info.fill_ratio_needed; 1511 info->factor_mallocs = A->info.factor_mallocs; 1512 } else { 1513 info->fill_ratio_given = 0; 1514 info->fill_ratio_needed = 0; 1515 info->factor_mallocs = 0; 1516 } 1517 PetscFunctionReturn(PETSC_SUCCESS); 1518 } 1519 1520 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1521 { 1522 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1523 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1524 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1525 MatScalar *vp, value; 1526 #if defined(PETSC_HAVE_CUPM) 1527 PetscBool inserted = PETSC_FALSE; 1528 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1529 #endif 1530 1531 PetscFunctionBegin; 1532 for (k = 0; k < m; k++) { /* loop over added rows */ 1533 row = im[k]; 1534 if (row < 0) continue; 1535 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1536 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1537 cp = a->colidx + shift; /* pointer to the row */ 1538 vp = a->val + shift; /* pointer to the row */ 1539 nrow = a->rlen[row]; 1540 low = 0; 1541 high = nrow; 1542 1543 for (l = 0; l < n; l++) { /* loop over added columns */ 1544 col = in[l]; 1545 if (col < 0) continue; 1546 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1547 if (a->roworiented) { 1548 value = v[l + k * n]; 1549 } else { 1550 value = v[k + l * m]; 1551 } 1552 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1553 1554 /* search in this row for the specified column, i indicates the column to be set */ 1555 if (col <= lastcol) low = 0; 1556 else high = nrow; 1557 lastcol = col; 1558 while (high - low > 5) { 1559 t = (low + high) / 2; 1560 if (*(cp + a->sliceheight * t) > col) high = t; 1561 else low = t; 1562 } 1563 for (i = low; i < high; i++) { 1564 if (*(cp + a->sliceheight * i) > col) break; 1565 if (*(cp + a->sliceheight * i) == col) { 1566 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1567 else *(vp + a->sliceheight * i) = value; 1568 #if defined(PETSC_HAVE_CUPM) 1569 inserted = PETSC_TRUE; 1570 #endif 1571 low = i + 1; 1572 goto noinsert; 1573 } 1574 } 1575 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1576 if (nonew == 1) goto noinsert; 1577 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1578 #if defined(PETSC_HAVE_CUPM) 1579 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1580 #else 1581 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1582 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1583 #endif 1584 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1585 for (ii = nrow - 1; ii >= i; ii--) { 1586 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1587 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1588 } 1589 a->rlen[row]++; 1590 *(cp + a->sliceheight * i) = col; 1591 *(vp + a->sliceheight * i) = value; 1592 a->nz++; 1593 #if defined(PETSC_HAVE_CUPM) 1594 inserted = PETSC_TRUE; 1595 #endif 1596 low = i + 1; 1597 high++; 1598 nrow++; 1599 noinsert:; 1600 } 1601 a->rlen[row] = nrow; 1602 } 1603 #if defined(PETSC_HAVE_CUPM) 1604 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1605 #endif 1606 PetscFunctionReturn(PETSC_SUCCESS); 1607 } 1608 1609 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1610 { 1611 PetscFunctionBegin; 1612 /* If the two matrices have the same copy implementation, use fast copy. */ 1613 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1614 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1615 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1616 1617 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1618 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1619 } else { 1620 PetscCall(MatCopy_Basic(A, B, str)); 1621 } 1622 PetscFunctionReturn(PETSC_SUCCESS); 1623 } 1624 1625 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1626 { 1627 PetscFunctionBegin; 1628 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1629 PetscFunctionReturn(PETSC_SUCCESS); 1630 } 1631 1632 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1633 { 1634 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1635 1636 PetscFunctionBegin; 1637 *array = a->val; 1638 PetscFunctionReturn(PETSC_SUCCESS); 1639 } 1640 1641 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1642 { 1643 PetscFunctionBegin; 1644 PetscFunctionReturn(PETSC_SUCCESS); 1645 } 1646 1647 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1648 { 1649 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1650 MatScalar *aval = a->val; 1651 PetscScalar oalpha = alpha; 1652 PetscBLASInt one = 1, size; 1653 1654 PetscFunctionBegin; 1655 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1656 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1657 PetscCall(PetscLogFlops(a->nz)); 1658 PetscCall(MatSeqSELLInvalidateDiagonal(inA)); 1659 #if defined(PETSC_HAVE_CUPM) 1660 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1661 #endif 1662 PetscFunctionReturn(PETSC_SUCCESS); 1663 } 1664 1665 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1666 { 1667 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1668 1669 PetscFunctionBegin; 1670 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1671 PetscCall(MatShift_Basic(Y, a)); 1672 PetscFunctionReturn(PETSC_SUCCESS); 1673 } 1674 1675 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1676 { 1677 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1678 PetscScalar *x, sum, *t; 1679 const MatScalar *idiag = NULL, *mdiag; 1680 const PetscScalar *b, *xb; 1681 PetscInt n, m = A->rmap->n, i, j, shift; 1682 const PetscInt *diag; 1683 1684 PetscFunctionBegin; 1685 its = its * lits; 1686 1687 if (fshift != a->fshift || omega != a->omega) a->idiagvalid = PETSC_FALSE; /* must recompute idiag[] */ 1688 if (!a->idiagvalid) PetscCall(MatInvertDiagonal_SeqSELL(A, omega, fshift)); 1689 a->fshift = fshift; 1690 a->omega = omega; 1691 1692 diag = a->diag; 1693 t = a->ssor_work; 1694 idiag = a->idiag; 1695 mdiag = a->mdiag; 1696 1697 PetscCall(VecGetArray(xx, &x)); 1698 PetscCall(VecGetArrayRead(bb, &b)); 1699 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1700 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1701 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1702 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1703 1704 if (flag & SOR_ZERO_INITIAL_GUESS) { 1705 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1706 for (i = 0; i < m; i++) { 1707 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1708 sum = b[i]; 1709 n = (diag[i] - shift) / a->sliceheight; 1710 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1711 t[i] = sum; 1712 x[i] = sum * idiag[i]; 1713 } 1714 xb = t; 1715 PetscCall(PetscLogFlops(a->nz)); 1716 } else xb = b; 1717 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1718 for (i = m - 1; i >= 0; i--) { 1719 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1720 sum = xb[i]; 1721 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1722 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1723 if (xb == b) { 1724 x[i] = sum * idiag[i]; 1725 } else { 1726 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1727 } 1728 } 1729 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1730 } 1731 its--; 1732 } 1733 while (its--) { 1734 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1735 for (i = 0; i < m; i++) { 1736 /* lower */ 1737 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1738 sum = b[i]; 1739 n = (diag[i] - shift) / a->sliceheight; 1740 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1741 t[i] = sum; /* save application of the lower-triangular part */ 1742 /* upper */ 1743 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1744 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1745 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1746 } 1747 xb = t; 1748 PetscCall(PetscLogFlops(2.0 * a->nz)); 1749 } else xb = b; 1750 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1751 for (i = m - 1; i >= 0; i--) { 1752 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1753 sum = xb[i]; 1754 if (xb == b) { 1755 /* whole matrix (no checkpointing available) */ 1756 n = a->rlen[i]; 1757 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1758 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1759 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1760 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1761 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1762 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1763 } 1764 } 1765 if (xb == b) { 1766 PetscCall(PetscLogFlops(2.0 * a->nz)); 1767 } else { 1768 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1769 } 1770 } 1771 } 1772 PetscCall(VecRestoreArray(xx, &x)); 1773 PetscCall(VecRestoreArrayRead(bb, &b)); 1774 PetscFunctionReturn(PETSC_SUCCESS); 1775 } 1776 1777 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1778 MatGetRow_SeqSELL, 1779 MatRestoreRow_SeqSELL, 1780 MatMult_SeqSELL, 1781 /* 4*/ MatMultAdd_SeqSELL, 1782 MatMultTranspose_SeqSELL, 1783 MatMultTransposeAdd_SeqSELL, 1784 NULL, 1785 NULL, 1786 NULL, 1787 /* 10*/ NULL, 1788 NULL, 1789 NULL, 1790 MatSOR_SeqSELL, 1791 NULL, 1792 /* 15*/ MatGetInfo_SeqSELL, 1793 MatEqual_SeqSELL, 1794 MatGetDiagonal_SeqSELL, 1795 MatDiagonalScale_SeqSELL, 1796 NULL, 1797 /* 20*/ NULL, 1798 MatAssemblyEnd_SeqSELL, 1799 MatSetOption_SeqSELL, 1800 MatZeroEntries_SeqSELL, 1801 /* 24*/ NULL, 1802 NULL, 1803 NULL, 1804 NULL, 1805 NULL, 1806 /* 29*/ MatSetUp_SeqSELL, 1807 NULL, 1808 NULL, 1809 NULL, 1810 NULL, 1811 /* 34*/ MatDuplicate_SeqSELL, 1812 NULL, 1813 NULL, 1814 NULL, 1815 NULL, 1816 /* 39*/ NULL, 1817 NULL, 1818 NULL, 1819 MatGetValues_SeqSELL, 1820 MatCopy_SeqSELL, 1821 /* 44*/ NULL, 1822 MatScale_SeqSELL, 1823 MatShift_SeqSELL, 1824 NULL, 1825 NULL, 1826 /* 49*/ NULL, 1827 NULL, 1828 NULL, 1829 NULL, 1830 NULL, 1831 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1832 NULL, 1833 NULL, 1834 NULL, 1835 NULL, 1836 /* 59*/ NULL, 1837 MatDestroy_SeqSELL, 1838 MatView_SeqSELL, 1839 NULL, 1840 NULL, 1841 /* 64*/ NULL, 1842 NULL, 1843 NULL, 1844 NULL, 1845 NULL, 1846 /* 69*/ NULL, 1847 NULL, 1848 NULL, 1849 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1850 NULL, 1851 /* 74*/ NULL, 1852 NULL, 1853 NULL, 1854 NULL, 1855 NULL, 1856 /* 79*/ NULL, 1857 NULL, 1858 NULL, 1859 NULL, 1860 NULL, 1861 /* 84*/ NULL, 1862 NULL, 1863 NULL, 1864 NULL, 1865 NULL, 1866 /* 89*/ NULL, 1867 NULL, 1868 NULL, 1869 NULL, 1870 MatConjugate_SeqSELL, 1871 /* 94*/ NULL, 1872 NULL, 1873 NULL, 1874 NULL, 1875 NULL, 1876 /* 99*/ NULL, 1877 NULL, 1878 NULL, 1879 NULL, 1880 NULL, 1881 /*104*/ MatMissingDiagonal_SeqSELL, 1882 NULL, 1883 NULL, 1884 NULL, 1885 NULL, 1886 /*109*/ NULL, 1887 NULL, 1888 NULL, 1889 NULL, 1890 NULL, 1891 /*114*/ NULL, 1892 NULL, 1893 NULL, 1894 NULL, 1895 NULL, 1896 /*119*/ NULL, 1897 NULL, 1898 NULL, 1899 NULL, 1900 NULL, 1901 /*124*/ NULL, 1902 NULL, 1903 NULL, 1904 NULL, 1905 NULL, 1906 /*129*/ MatFDColoringSetUp_SeqXAIJ, 1907 NULL, 1908 NULL, 1909 NULL, 1910 NULL, 1911 /*134*/ NULL, 1912 NULL, 1913 NULL, 1914 NULL, 1915 NULL, 1916 /*139*/ NULL, 1917 NULL, 1918 NULL, 1919 NULL, 1920 NULL}; 1921 1922 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1923 { 1924 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1925 1926 PetscFunctionBegin; 1927 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1928 1929 /* allocate space for values if not already there */ 1930 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1931 1932 /* copy values over */ 1933 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1934 PetscFunctionReturn(PETSC_SUCCESS); 1935 } 1936 1937 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1938 { 1939 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1940 1941 PetscFunctionBegin; 1942 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1943 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1944 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1945 PetscFunctionReturn(PETSC_SUCCESS); 1946 } 1947 1948 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 1949 { 1950 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1951 1952 PetscFunctionBegin; 1953 if (a->totalslices && a->sliidx[a->totalslices]) { 1954 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 1955 } else { 1956 *ratio = 0.0; 1957 } 1958 PetscFunctionReturn(PETSC_SUCCESS); 1959 } 1960 1961 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 1962 { 1963 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1964 PetscInt i, current_slicewidth; 1965 1966 PetscFunctionBegin; 1967 *slicewidth = 0; 1968 for (i = 0; i < a->totalslices; i++) { 1969 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 1970 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 1971 } 1972 PetscFunctionReturn(PETSC_SUCCESS); 1973 } 1974 1975 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 1976 { 1977 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1978 1979 PetscFunctionBegin; 1980 *slicewidth = 0; 1981 if (a->totalslices) *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; 1982 PetscFunctionReturn(PETSC_SUCCESS); 1983 } 1984 1985 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance) 1986 { 1987 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1988 PetscReal mean; 1989 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx; 1990 1991 PetscFunctionBegin; 1992 *variance = 0; 1993 if (totalslices) { 1994 mean = (PetscReal)sliidx[totalslices] / totalslices; 1995 for (i = 1; i <= totalslices; i++) *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; 1996 } 1997 PetscFunctionReturn(PETSC_SUCCESS); 1998 } 1999 2000 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2001 { 2002 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2003 2004 PetscFunctionBegin; 2005 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2006 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2007 a->sliceheight = sliceheight; 2008 #if defined(PETSC_HAVE_CUPM) 2009 PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight); 2010 #endif 2011 PetscFunctionReturn(PETSC_SUCCESS); 2012 } 2013 2014 /*@ 2015 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2016 2017 Not Collective 2018 2019 Input Parameter: 2020 . A - a MATSEQSELL matrix 2021 2022 Output Parameter: 2023 . ratio - ratio of number of padded zeros to number of allocated elements 2024 2025 Level: intermediate 2026 2027 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2028 @*/ 2029 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2030 { 2031 PetscFunctionBegin; 2032 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2033 PetscFunctionReturn(PETSC_SUCCESS); 2034 } 2035 2036 /*@ 2037 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2038 2039 Not Collective 2040 2041 Input Parameter: 2042 . A - a MATSEQSELL matrix 2043 2044 Output Parameter: 2045 . slicewidth - maximum slice width 2046 2047 Level: intermediate 2048 2049 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2050 @*/ 2051 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2052 { 2053 PetscFunctionBegin; 2054 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2055 PetscFunctionReturn(PETSC_SUCCESS); 2056 } 2057 2058 /*@ 2059 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2060 2061 Not Collective 2062 2063 Input Parameter: 2064 . A - a MATSEQSELL matrix 2065 2066 Output Parameter: 2067 . slicewidth - average slice width 2068 2069 Level: intermediate 2070 2071 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()` 2072 @*/ 2073 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2074 { 2075 PetscFunctionBegin; 2076 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2077 PetscFunctionReturn(PETSC_SUCCESS); 2078 } 2079 2080 /*@ 2081 MatSeqSELLSetSliceHeight - sets the slice height. 2082 2083 Not Collective 2084 2085 Input Parameters: 2086 + A - a MATSEQSELL matrix 2087 - sliceheight - slice height 2088 2089 Notes: 2090 You cannot change the slice height once it have been set. 2091 2092 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2093 2094 Level: intermediate 2095 2096 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()` 2097 @*/ 2098 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2099 { 2100 PetscFunctionBegin; 2101 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2102 PetscFunctionReturn(PETSC_SUCCESS); 2103 } 2104 2105 /*@ 2106 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2107 2108 Not Collective 2109 2110 Input Parameter: 2111 . A - a MATSEQSELL matrix 2112 2113 Output Parameter: 2114 . variance - variance of the slice size 2115 2116 Level: intermediate 2117 2118 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()` 2119 @*/ 2120 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2121 { 2122 PetscFunctionBegin; 2123 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2124 PetscFunctionReturn(PETSC_SUCCESS); 2125 } 2126 2127 #if defined(PETSC_HAVE_CUDA) 2128 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2129 #endif 2130 #if defined(PETSC_HAVE_HIP) 2131 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat); 2132 #endif 2133 2134 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2135 { 2136 Mat_SeqSELL *b; 2137 PetscMPIInt size; 2138 2139 PetscFunctionBegin; 2140 PetscCall(PetscCitationsRegister(citation, &cited)); 2141 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2142 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2143 2144 PetscCall(PetscNew(&b)); 2145 2146 B->data = (void *)b; 2147 B->ops[0] = MatOps_Values; 2148 2149 b->row = NULL; 2150 b->col = NULL; 2151 b->icol = NULL; 2152 b->reallocs = 0; 2153 b->ignorezeroentries = PETSC_FALSE; 2154 b->roworiented = PETSC_TRUE; 2155 b->nonew = 0; 2156 b->diag = NULL; 2157 b->solve_work = NULL; 2158 B->spptr = NULL; 2159 b->saved_values = NULL; 2160 b->idiag = NULL; 2161 b->mdiag = NULL; 2162 b->ssor_work = NULL; 2163 b->omega = 1.0; 2164 b->fshift = 0.0; 2165 b->idiagvalid = PETSC_FALSE; 2166 b->keepnonzeropattern = PETSC_FALSE; 2167 b->sliceheight = 0; 2168 2169 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2170 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2171 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2172 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2173 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2174 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2175 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 2176 #if defined(PETSC_HAVE_CUDA) 2177 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2178 #endif 2179 #if defined(PETSC_HAVE_HIP) 2180 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP)); 2181 #endif 2182 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2183 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2184 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2185 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL)); 2186 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2187 2188 PetscObjectOptionsBegin((PetscObject)B); 2189 { 2190 PetscInt newsh = -1; 2191 PetscBool flg; 2192 #if defined(PETSC_HAVE_CUPM) 2193 PetscInt chunksize = 0; 2194 #endif 2195 2196 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2197 if (flg) PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); 2198 #if defined(PETSC_HAVE_CUPM) 2199 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2200 if (flg) { 2201 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2202 b->chunksize = chunksize; 2203 } 2204 #endif 2205 } 2206 PetscOptionsEnd(); 2207 PetscFunctionReturn(PETSC_SUCCESS); 2208 } 2209 2210 /* 2211 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2212 */ 2213 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2214 { 2215 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2216 PetscInt i, m = A->rmap->n; 2217 PetscInt totalslices = a->totalslices; 2218 2219 PetscFunctionBegin; 2220 C->factortype = A->factortype; 2221 c->row = NULL; 2222 c->col = NULL; 2223 c->icol = NULL; 2224 c->reallocs = 0; 2225 C->assembled = PETSC_TRUE; 2226 2227 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2228 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2229 2230 c->sliceheight = a->sliceheight; 2231 PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen)); 2232 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2233 2234 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2235 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2236 2237 /* allocate the matrix space */ 2238 if (mallocmatspace) { 2239 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2240 2241 c->singlemalloc = PETSC_TRUE; 2242 2243 if (m > 0) { 2244 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2245 if (cpvalues == MAT_COPY_VALUES) { 2246 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2247 } else { 2248 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2249 } 2250 } 2251 } 2252 2253 c->ignorezeroentries = a->ignorezeroentries; 2254 c->roworiented = a->roworiented; 2255 c->nonew = a->nonew; 2256 if (a->diag) { 2257 PetscCall(PetscMalloc1(m, &c->diag)); 2258 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2259 } else c->diag = NULL; 2260 2261 c->solve_work = NULL; 2262 c->saved_values = NULL; 2263 c->idiag = NULL; 2264 c->ssor_work = NULL; 2265 c->keepnonzeropattern = a->keepnonzeropattern; 2266 c->free_val = PETSC_TRUE; 2267 c->free_colidx = PETSC_TRUE; 2268 2269 c->maxallocmat = a->maxallocmat; 2270 c->maxallocrow = a->maxallocrow; 2271 c->rlenmax = a->rlenmax; 2272 c->nz = a->nz; 2273 C->preallocated = PETSC_TRUE; 2274 2275 c->nonzerorowcnt = a->nonzerorowcnt; 2276 C->nonzerostate = A->nonzerostate; 2277 2278 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2279 PetscFunctionReturn(PETSC_SUCCESS); 2280 } 2281 2282 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2283 { 2284 PetscFunctionBegin; 2285 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2286 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2287 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2288 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2289 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2290 PetscFunctionReturn(PETSC_SUCCESS); 2291 } 2292 2293 /*MC 2294 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2295 based on the sliced Ellpack format, {cite}`zhangellpack2018` 2296 2297 Options Database Key: 2298 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2299 2300 Level: beginner 2301 2302 .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2303 M*/ 2304 2305 /*MC 2306 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018` 2307 2308 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2309 and `MATMPISELL` otherwise. As a result, for single process communicators, 2310 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2311 for communicators controlling multiple processes. It is recommended that you call both of 2312 the above preallocation routines for simplicity. 2313 2314 Options Database Key: 2315 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2316 2317 Level: beginner 2318 2319 Notes: 2320 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults). 2321 2322 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2323 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2324 2325 Developer Notes: 2326 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2327 2328 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2329 .vb 2330 (2 0 3 4) 2331 Consider the matrix A = (5 0 6 0) 2332 (0 0 7 8) 2333 (0 0 9 9) 2334 2335 symbolically the Ellpack format can be written as 2336 2337 (2 3 4 |) (0 2 3 |) 2338 v = (5 6 0 |) colidx = (0 2 2 |) 2339 -------- --------- 2340 (7 8 |) (2 3 |) 2341 (9 9 |) (2 3 |) 2342 2343 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2344 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2345 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2346 2347 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2348 2349 .ve 2350 2351 See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance. 2352 2353 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2354 M*/ 2355 2356 /*@ 2357 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2358 2359 Collective 2360 2361 Input Parameters: 2362 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2363 . m - number of rows 2364 . n - number of columns 2365 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2366 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2367 2368 Output Parameter: 2369 . A - the matrix 2370 2371 Level: intermediate 2372 2373 Notes: 2374 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2375 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2376 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2377 2378 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2379 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2380 allocation. 2381 2382 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL` 2383 @*/ 2384 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2385 { 2386 PetscFunctionBegin; 2387 PetscCall(MatCreate(comm, A)); 2388 PetscCall(MatSetSizes(*A, m, n, m, n)); 2389 PetscCall(MatSetType(*A, MATSEQSELL)); 2390 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2391 PetscFunctionReturn(PETSC_SUCCESS); 2392 } 2393 2394 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2395 { 2396 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2397 PetscInt totalslices = a->totalslices; 2398 2399 PetscFunctionBegin; 2400 /* If the matrix dimensions are not equal,or no of nonzeros */ 2401 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2402 *flg = PETSC_FALSE; 2403 PetscFunctionReturn(PETSC_SUCCESS); 2404 } 2405 /* if the a->colidx are the same */ 2406 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2407 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2408 /* if a->val are the same */ 2409 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2410 PetscFunctionReturn(PETSC_SUCCESS); 2411 } 2412 2413 PetscErrorCode MatSeqSELLInvalidateDiagonal(Mat A) 2414 { 2415 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2416 2417 PetscFunctionBegin; 2418 a->idiagvalid = PETSC_FALSE; 2419 PetscFunctionReturn(PETSC_SUCCESS); 2420 } 2421 2422 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2423 { 2424 #if defined(PETSC_USE_COMPLEX) 2425 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2426 PetscInt i; 2427 PetscScalar *val = a->val; 2428 2429 PetscFunctionBegin; 2430 for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]); 2431 #if defined(PETSC_HAVE_CUPM) 2432 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2433 #endif 2434 #else 2435 PetscFunctionBegin; 2436 #endif 2437 PetscFunctionReturn(PETSC_SUCCESS); 2438 } 2439