1 /* 2 Defines the basic matrix operations for the SELL matrix storage format. 3 */ 4 #include <../src/mat/impls/sell/seq/sell.h> /*I "petscmat.h" I*/ 5 #include <petscblaslapack.h> 6 #include <petsc/private/kernels/blocktranspose.h> 7 8 static PetscBool cited = PETSC_FALSE; 9 static const char citation[] = "@inproceedings{ZhangELLPACK2018,\n" 10 " author = {Hong Zhang and Richard T. Mills and Karl Rupp and Barry F. Smith},\n" 11 " title = {Vectorized Parallel Sparse Matrix-Vector Multiplication in {PETSc} Using {AVX-512}},\n" 12 " booktitle = {Proceedings of the 47th International Conference on Parallel Processing},\n" 13 " year = 2018\n" 14 "}\n"; 15 16 #if defined(PETSC_HAVE_IMMINTRIN_H) && (defined(__AVX512F__) || (defined(__AVX2__) && defined(__FMA__)) || defined(__AVX__)) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 17 18 #include <immintrin.h> 19 20 #if !defined(_MM_SCALE_8) 21 #define _MM_SCALE_8 8 22 #endif 23 24 #if defined(__AVX512F__) 25 /* these do not work 26 vec_idx = _mm512_loadunpackhi_epi32(vec_idx,acolidx); 27 vec_vals = _mm512_loadunpackhi_pd(vec_vals,aval); 28 */ 29 #define AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 30 /* if the mask bit is set, copy from acolidx, otherwise from vec_idx */ \ 31 vec_idx = _mm256_loadu_si256((__m256i const *)acolidx); \ 32 vec_vals = _mm512_loadu_pd(aval); \ 33 vec_x = _mm512_i32gather_pd(vec_idx, x, _MM_SCALE_8); \ 34 vec_y = _mm512_fmadd_pd(vec_x, vec_vals, vec_y) 35 #elif defined(__AVX2__) && defined(__FMA__) 36 #define AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y) \ 37 vec_vals = _mm256_loadu_pd(aval); \ 38 vec_idx = _mm_loadu_si128((__m128i const *)acolidx); /* SSE2 */ \ 39 vec_x = _mm256_i32gather_pd(x, vec_idx, _MM_SCALE_8); \ 40 vec_y = _mm256_fmadd_pd(vec_x, vec_vals, vec_y) 41 #endif 42 #endif /* PETSC_HAVE_IMMINTRIN_H */ 43 44 /*@ 45 MatSeqSELLSetPreallocation - For good matrix assembly performance 46 the user should preallocate the matrix storage by setting the parameter `nz` 47 (or the array `nnz`). 48 49 Collective 50 51 Input Parameters: 52 + B - The `MATSEQSELL` matrix 53 . rlenmax - number of nonzeros per row (same for all rows), ignored if `rlen` is provided 54 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 55 56 Level: intermediate 57 58 Notes: 59 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 60 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 61 allocation. 62 63 You can call `MatGetInfo()` to get information on how effective the preallocation was; 64 for example the fields mallocs,nz_allocated,nz_used,nz_unneeded; 65 You can also run with the option `-info` and look for messages with the string 66 malloc in them to see if additional memory allocation was needed. 67 68 Developer Notes: 69 Use `rlenmax` of `MAT_SKIP_ALLOCATION` to not allocate any space for the matrix 70 entries or columns indices. 71 72 The maximum number of nonzeos in any row should be as accurate as possible. 73 If it is underestimated, you will get bad performance due to reallocation 74 (`MatSeqXSELLReallocateSELL()`). 75 76 .seealso: `Mat`, `MATSEQSELL`, `MATSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatGetInfo()` 77 @*/ 78 PetscErrorCode MatSeqSELLSetPreallocation(Mat B, PetscInt rlenmax, const PetscInt rlen[]) 79 { 80 PetscFunctionBegin; 81 PetscValidHeaderSpecific(B, MAT_CLASSID, 1); 82 PetscValidType(B, 1); 83 PetscTryMethod(B, "MatSeqSELLSetPreallocation_C", (Mat, PetscInt, const PetscInt[]), (B, rlenmax, rlen)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 PetscErrorCode MatSeqSELLSetPreallocation_SeqSELL(Mat B, PetscInt maxallocrow, const PetscInt rlen[]) 88 { 89 Mat_SeqSELL *b; 90 PetscInt i, j, totalslices; 91 #if defined(PETSC_HAVE_CUPM) 92 PetscInt rlenmax = 0; 93 #endif 94 PetscBool skipallocation = PETSC_FALSE, realalloc = PETSC_FALSE; 95 96 PetscFunctionBegin; 97 if (maxallocrow >= 0 || rlen) realalloc = PETSC_TRUE; 98 if (maxallocrow == MAT_SKIP_ALLOCATION) { 99 skipallocation = PETSC_TRUE; 100 maxallocrow = 0; 101 } 102 103 PetscCall(PetscLayoutSetUp(B->rmap)); 104 PetscCall(PetscLayoutSetUp(B->cmap)); 105 106 /* FIXME: if one preallocates more space than needed, the matrix does not shrink automatically, but for best performance it should */ 107 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 5; 108 PetscCheck(maxallocrow >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "maxallocrow cannot be less than 0: value %" PetscInt_FMT, maxallocrow); 109 if (rlen) { 110 for (i = 0; i < B->rmap->n; i++) { 111 PetscCheck(rlen[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, rlen[i]); 112 PetscCheck(rlen[i] <= B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "rlen cannot be greater than row length: local row %" PetscInt_FMT " value %" PetscInt_FMT " rowlength %" PetscInt_FMT, i, rlen[i], B->cmap->n); 113 } 114 } 115 116 B->preallocated = PETSC_TRUE; 117 118 b = (Mat_SeqSELL *)B->data; 119 120 if (!b->sliceheight) { /* not set yet */ 121 #if defined(PETSC_HAVE_CUPM) 122 b->sliceheight = 16; 123 #else 124 b->sliceheight = 8; 125 #endif 126 } 127 totalslices = PetscCeilInt(B->rmap->n, b->sliceheight); 128 b->totalslices = totalslices; 129 if (!skipallocation) { 130 if (B->rmap->n % b->sliceheight) PetscCall(PetscInfo(B, "Padding rows to the SEQSELL matrix because the number of rows is not the multiple of the slice height (value %" PetscInt_FMT ")\n", B->rmap->n)); 131 132 if (!b->sliidx) { /* sliidx gives the starting index of each slice, the last element is the total space allocated */ 133 PetscCall(PetscMalloc1(totalslices + 1, &b->sliidx)); 134 } 135 if (!rlen) { /* if rlen is not provided, allocate same space for all the slices */ 136 if (maxallocrow == PETSC_DEFAULT || maxallocrow == PETSC_DECIDE) maxallocrow = 10; 137 else if (maxallocrow < 0) maxallocrow = 1; 138 #if defined(PETSC_HAVE_CUPM) 139 rlenmax = maxallocrow; 140 /* Pad the slice to DEVICE_MEM_ALIGN */ 141 while (b->sliceheight * maxallocrow % DEVICE_MEM_ALIGN) maxallocrow++; 142 #endif 143 for (i = 0; i <= totalslices; i++) b->sliidx[i] = b->sliceheight * i * maxallocrow; 144 } else { 145 #if defined(PETSC_HAVE_CUPM) 146 PetscInt mul = DEVICE_MEM_ALIGN / b->sliceheight; 147 #endif 148 maxallocrow = 0; 149 b->sliidx[0] = 0; 150 for (i = 1; i < totalslices; i++) { 151 b->sliidx[i] = 0; 152 for (j = 0; j < b->sliceheight; j++) b->sliidx[i] = PetscMax(b->sliidx[i], rlen[b->sliceheight * (i - 1) + j]); 153 #if defined(PETSC_HAVE_CUPM) 154 if (mul != 0) { /* Pad the slice to DEVICE_MEM_ALIGN if sliceheight < DEVICE_MEM_ALIGN */ 155 rlenmax = PetscMax(b->sliidx[i], rlenmax); 156 b->sliidx[i] = ((b->sliidx[i] - 1) / mul + 1) * mul; 157 } 158 #endif 159 maxallocrow = PetscMax(b->sliidx[i], maxallocrow); 160 PetscCall(PetscIntSumError(b->sliidx[i - 1], b->sliceheight * b->sliidx[i], &b->sliidx[i])); 161 } 162 /* last slice */ 163 b->sliidx[totalslices] = 0; 164 for (j = b->sliceheight * (totalslices - 1); j < B->rmap->n; j++) b->sliidx[totalslices] = PetscMax(b->sliidx[totalslices], rlen[j]); 165 #if defined(PETSC_HAVE_CUPM) 166 if (mul != 0) { 167 rlenmax = PetscMax(b->sliidx[i], rlenmax); 168 b->sliidx[totalslices] = ((b->sliidx[totalslices] - 1) / mul + 1) * mul; 169 } 170 #endif 171 maxallocrow = PetscMax(b->sliidx[totalslices], maxallocrow); 172 b->sliidx[totalslices] = b->sliidx[totalslices - 1] + b->sliceheight * b->sliidx[totalslices]; 173 } 174 175 /* allocate space for val, colidx, rlen */ 176 /* FIXME: should B's old memory be unlogged? */ 177 PetscCall(MatSeqXSELLFreeSELL(B, &b->val, &b->colidx)); 178 /* FIXME: assuming an element of the bit array takes 8 bits */ 179 PetscCall(PetscMalloc2(b->sliidx[totalslices], &b->val, b->sliidx[totalslices], &b->colidx)); 180 /* b->rlen will count nonzeros in each row so far. We dont copy rlen to b->rlen because the matrix has not been set. */ 181 PetscCall(PetscCalloc1(b->sliceheight * totalslices, &b->rlen)); 182 183 b->singlemalloc = PETSC_TRUE; 184 b->free_val = PETSC_TRUE; 185 b->free_colidx = PETSC_TRUE; 186 } else { 187 b->free_val = PETSC_FALSE; 188 b->free_colidx = PETSC_FALSE; 189 } 190 191 b->nz = 0; 192 b->maxallocrow = maxallocrow; 193 #if defined(PETSC_HAVE_CUPM) 194 b->rlenmax = rlenmax; 195 #else 196 b->rlenmax = maxallocrow; 197 #endif 198 b->maxallocmat = b->sliidx[totalslices]; 199 B->info.nz_unneeded = (double)b->maxallocmat; 200 if (realalloc) PetscCall(MatSetOption(B, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 201 PetscFunctionReturn(PETSC_SUCCESS); 202 } 203 204 static PetscErrorCode MatGetRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 205 { 206 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 207 PetscInt shift; 208 209 PetscFunctionBegin; 210 PetscCheck(row >= 0 && row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row %" PetscInt_FMT " out of range", row); 211 if (nz) *nz = a->rlen[row]; 212 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); 213 if (!a->getrowcols) PetscCall(PetscMalloc2(a->rlenmax, &a->getrowcols, a->rlenmax, &a->getrowvals)); 214 if (idx) { 215 PetscInt j; 216 for (j = 0; j < a->rlen[row]; j++) a->getrowcols[j] = a->colidx[shift + a->sliceheight * j]; 217 *idx = a->getrowcols; 218 } 219 if (v) { 220 PetscInt j; 221 for (j = 0; j < a->rlen[row]; j++) a->getrowvals[j] = a->val[shift + a->sliceheight * j]; 222 *v = a->getrowvals; 223 } 224 PetscFunctionReturn(PETSC_SUCCESS); 225 } 226 227 static PetscErrorCode MatRestoreRow_SeqSELL(Mat A, PetscInt row, PetscInt *nz, PetscInt **idx, PetscScalar **v) 228 { 229 PetscFunctionBegin; 230 PetscFunctionReturn(PETSC_SUCCESS); 231 } 232 233 PetscErrorCode MatConvert_SeqSELL_SeqAIJ(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 234 { 235 Mat B; 236 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 237 PetscInt i; 238 239 PetscFunctionBegin; 240 if (reuse == MAT_REUSE_MATRIX) { 241 B = *newmat; 242 PetscCall(MatZeroEntries(B)); 243 } else { 244 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 245 PetscCall(MatSetSizes(B, A->rmap->n, A->cmap->n, A->rmap->N, A->cmap->N)); 246 PetscCall(MatSetType(B, MATSEQAIJ)); 247 PetscCall(MatSeqAIJSetPreallocation(B, 0, a->rlen)); 248 } 249 250 for (i = 0; i < A->rmap->n; i++) { 251 PetscInt nz = 0, *cols = NULL; 252 PetscScalar *vals = NULL; 253 254 PetscCall(MatGetRow_SeqSELL(A, i, &nz, &cols, &vals)); 255 PetscCall(MatSetValues(B, 1, &i, nz, cols, vals, INSERT_VALUES)); 256 PetscCall(MatRestoreRow_SeqSELL(A, i, &nz, &cols, &vals)); 257 } 258 259 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 260 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 261 B->rmap->bs = A->rmap->bs; 262 263 if (reuse == MAT_INPLACE_MATRIX) { 264 PetscCall(MatHeaderReplace(A, &B)); 265 } else { 266 *newmat = B; 267 } 268 PetscFunctionReturn(PETSC_SUCCESS); 269 } 270 271 #include <../src/mat/impls/aij/seq/aij.h> 272 273 PetscErrorCode MatConvert_SeqAIJ_SeqSELL(Mat A, MatType newtype, MatReuse reuse, Mat *newmat) 274 { 275 Mat B; 276 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 277 PetscInt *ai = a->i, m = A->rmap->N, n = A->cmap->N, i, *rowlengths, row, ncols; 278 const PetscInt *cols; 279 const PetscScalar *vals; 280 281 PetscFunctionBegin; 282 if (reuse == MAT_REUSE_MATRIX) { 283 B = *newmat; 284 } else { 285 if (PetscDefined(USE_DEBUG) || !a->ilen) { 286 PetscCall(PetscMalloc1(m, &rowlengths)); 287 for (i = 0; i < m; i++) rowlengths[i] = ai[i + 1] - ai[i]; 288 } 289 if (PetscDefined(USE_DEBUG) && a->ilen) { 290 PetscBool eq; 291 PetscCall(PetscMemcmp(rowlengths, a->ilen, m * sizeof(PetscInt), &eq)); 292 PetscCheck(eq, PETSC_COMM_SELF, PETSC_ERR_PLIB, "SeqAIJ ilen array incorrect"); 293 PetscCall(PetscFree(rowlengths)); 294 rowlengths = a->ilen; 295 } else if (a->ilen) rowlengths = a->ilen; 296 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), &B)); 297 PetscCall(MatSetSizes(B, m, n, m, n)); 298 PetscCall(MatSetType(B, MATSEQSELL)); 299 PetscCall(MatSeqSELLSetPreallocation(B, 0, rowlengths)); 300 if (rowlengths != a->ilen) PetscCall(PetscFree(rowlengths)); 301 } 302 303 for (row = 0; row < m; row++) { 304 PetscCall(MatGetRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 305 PetscCall(MatSetValues_SeqSELL(B, 1, &row, ncols, cols, vals, INSERT_VALUES)); 306 PetscCall(MatRestoreRow_SeqAIJ(A, row, &ncols, (PetscInt **)&cols, (PetscScalar **)&vals)); 307 } 308 PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 309 PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 310 B->rmap->bs = A->rmap->bs; 311 312 if (reuse == MAT_INPLACE_MATRIX) { 313 PetscCall(MatHeaderReplace(A, &B)); 314 } else { 315 *newmat = B; 316 } 317 PetscFunctionReturn(PETSC_SUCCESS); 318 } 319 320 PetscErrorCode MatMult_SeqSELL(Mat A, Vec xx, Vec yy) 321 { 322 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 323 PetscScalar *y; 324 const PetscScalar *x; 325 const MatScalar *aval = a->val; 326 PetscInt totalslices = a->totalslices; 327 const PetscInt *acolidx = a->colidx; 328 PetscInt i, j; 329 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 330 __m512d vec_x, vec_y, vec_vals; 331 __m256i vec_idx; 332 __mmask8 mask; 333 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 334 __m256i vec_idx2, vec_idx3, vec_idx4; 335 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 336 __m128i vec_idx; 337 __m256d vec_x, vec_y, vec_y2, vec_vals; 338 MatScalar yval; 339 PetscInt r, rows_left, row, nnz_in_row; 340 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 341 __m128d vec_x_tmp; 342 __m256d vec_x, vec_y, vec_y2, vec_vals; 343 MatScalar yval; 344 PetscInt r, rows_left, row, nnz_in_row; 345 #else 346 PetscInt k, sliceheight = a->sliceheight; 347 PetscScalar *sum; 348 #endif 349 350 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 351 #pragma disjoint(*x, *y, *aval) 352 #endif 353 354 PetscFunctionBegin; 355 PetscCall(VecGetArrayRead(xx, &x)); 356 PetscCall(VecGetArray(yy, &y)); 357 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 358 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 359 for (i = 0; i < totalslices; i++) { /* loop over slices */ 360 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 361 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 362 363 vec_y = _mm512_setzero_pd(); 364 vec_y2 = _mm512_setzero_pd(); 365 vec_y3 = _mm512_setzero_pd(); 366 vec_y4 = _mm512_setzero_pd(); 367 368 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 369 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 370 case 3: 371 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 372 acolidx += 8; 373 aval += 8; 374 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 375 acolidx += 8; 376 aval += 8; 377 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 378 acolidx += 8; 379 aval += 8; 380 j += 3; 381 break; 382 case 2: 383 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 384 acolidx += 8; 385 aval += 8; 386 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 387 acolidx += 8; 388 aval += 8; 389 j += 2; 390 break; 391 case 1: 392 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 393 acolidx += 8; 394 aval += 8; 395 j += 1; 396 break; 397 } 398 #pragma novector 399 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 400 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 401 acolidx += 8; 402 aval += 8; 403 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 404 acolidx += 8; 405 aval += 8; 406 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 407 acolidx += 8; 408 aval += 8; 409 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 410 acolidx += 8; 411 aval += 8; 412 } 413 414 vec_y = _mm512_add_pd(vec_y, vec_y2); 415 vec_y = _mm512_add_pd(vec_y, vec_y3); 416 vec_y = _mm512_add_pd(vec_y, vec_y4); 417 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 418 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 419 _mm512_mask_storeu_pd(&y[8 * i], mask, vec_y); 420 } else { 421 _mm512_storeu_pd(&y[8 * i], vec_y); 422 } 423 } 424 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 425 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 426 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 427 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 428 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 429 430 /* last slice may have padding rows. Don't use vectorization. */ 431 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 432 rows_left = A->rmap->n - 8 * i; 433 for (r = 0; r < rows_left; ++r) { 434 yval = (MatScalar)0; 435 row = 8 * i + r; 436 nnz_in_row = a->rlen[row]; 437 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 438 y[row] = yval; 439 } 440 break; 441 } 442 443 vec_y = _mm256_setzero_pd(); 444 vec_y2 = _mm256_setzero_pd(); 445 446 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 447 #pragma novector 448 #pragma unroll(2) 449 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 450 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 451 aval += 4; 452 acolidx += 4; 453 AVX2_Mult_Private(vec_idx, vec_x, vec_vals, vec_y2); 454 aval += 4; 455 acolidx += 4; 456 } 457 458 _mm256_storeu_pd(y + i * 8, vec_y); 459 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 460 } 461 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 462 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 463 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 464 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 465 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 466 467 vec_y = _mm256_setzero_pd(); 468 vec_y2 = _mm256_setzero_pd(); 469 470 /* last slice may have padding rows. Don't use vectorization. */ 471 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 472 rows_left = A->rmap->n - 8 * i; 473 for (r = 0; r < rows_left; ++r) { 474 yval = (MatScalar)0; 475 row = 8 * i + r; 476 nnz_in_row = a->rlen[row]; 477 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 478 y[row] = yval; 479 } 480 break; 481 } 482 483 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 484 #pragma novector 485 #pragma unroll(2) 486 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 487 vec_vals = _mm256_loadu_pd(aval); 488 vec_x_tmp = _mm_setzero_pd(); 489 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 490 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 491 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 492 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 493 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 494 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 495 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 496 aval += 4; 497 498 vec_vals = _mm256_loadu_pd(aval); 499 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 500 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 501 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 502 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 503 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 504 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 505 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 506 aval += 4; 507 } 508 509 _mm256_storeu_pd(y + i * 8, vec_y); 510 _mm256_storeu_pd(y + i * 8 + 4, vec_y2); 511 } 512 #else 513 PetscCall(PetscMalloc1(sliceheight, &sum)); 514 for (i = 0; i < totalslices; i++) { /* loop over slices */ 515 for (j = 0; j < sliceheight; j++) { 516 sum[j] = 0.0; 517 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 518 } 519 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { /* if last slice has padding rows */ 520 for (j = 0; j < (A->rmap->n % sliceheight); j++) y[sliceheight * i + j] = sum[j]; 521 } else { 522 for (j = 0; j < sliceheight; j++) y[sliceheight * i + j] = sum[j]; 523 } 524 } 525 PetscCall(PetscFree(sum)); 526 #endif 527 528 PetscCall(PetscLogFlops(2.0 * a->nz - a->nonzerorowcnt)); /* theoretical minimal FLOPs */ 529 PetscCall(VecRestoreArrayRead(xx, &x)); 530 PetscCall(VecRestoreArray(yy, &y)); 531 PetscFunctionReturn(PETSC_SUCCESS); 532 } 533 534 #include <../src/mat/impls/aij/seq/ftn-kernels/fmultadd.h> 535 PetscErrorCode MatMultAdd_SeqSELL(Mat A, Vec xx, Vec yy, Vec zz) 536 { 537 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 538 PetscScalar *y, *z; 539 const PetscScalar *x; 540 const MatScalar *aval = a->val; 541 PetscInt totalslices = a->totalslices; 542 const PetscInt *acolidx = a->colidx; 543 PetscInt i, j; 544 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 545 __m512d vec_x, vec_y, vec_vals; 546 __m256i vec_idx; 547 __mmask8 mask = 0; 548 __m512d vec_x2, vec_y2, vec_vals2, vec_x3, vec_y3, vec_vals3, vec_x4, vec_y4, vec_vals4; 549 __m256i vec_idx2, vec_idx3, vec_idx4; 550 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 551 __m128d vec_x_tmp; 552 __m256d vec_x, vec_y, vec_y2, vec_vals; 553 MatScalar yval; 554 PetscInt r, row, nnz_in_row; 555 #else 556 PetscInt k, sliceheight = a->sliceheight; 557 PetscScalar *sum; 558 #endif 559 560 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 561 #pragma disjoint(*x, *y, *aval) 562 #endif 563 564 PetscFunctionBegin; 565 if (!a->nz) { 566 PetscCall(VecCopy(yy, zz)); 567 PetscFunctionReturn(PETSC_SUCCESS); 568 } 569 PetscCall(VecGetArrayRead(xx, &x)); 570 PetscCall(VecGetArrayPair(yy, zz, &y, &z)); 571 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 572 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 573 for (i = 0; i < totalslices; i++) { /* loop over slices */ 574 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 575 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 576 577 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 578 mask = (__mmask8)(0xff >> (8 - (A->rmap->n & 0x07))); 579 vec_y = _mm512_mask_loadu_pd(vec_y, mask, &y[8 * i]); 580 } else { 581 vec_y = _mm512_loadu_pd(&y[8 * i]); 582 } 583 vec_y2 = _mm512_setzero_pd(); 584 vec_y3 = _mm512_setzero_pd(); 585 vec_y4 = _mm512_setzero_pd(); 586 587 j = a->sliidx[i] >> 3; /* 8 bytes are read at each time, corresponding to a slice column */ 588 switch ((a->sliidx[i + 1] - a->sliidx[i]) / 8 & 3) { 589 case 3: 590 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 591 acolidx += 8; 592 aval += 8; 593 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 594 acolidx += 8; 595 aval += 8; 596 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 597 acolidx += 8; 598 aval += 8; 599 j += 3; 600 break; 601 case 2: 602 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 603 acolidx += 8; 604 aval += 8; 605 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 606 acolidx += 8; 607 aval += 8; 608 j += 2; 609 break; 610 case 1: 611 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 612 acolidx += 8; 613 aval += 8; 614 j += 1; 615 break; 616 } 617 #pragma novector 618 for (; j < (a->sliidx[i + 1] >> 3); j += 4) { 619 AVX512_Mult_Private(vec_idx, vec_x, vec_vals, vec_y); 620 acolidx += 8; 621 aval += 8; 622 AVX512_Mult_Private(vec_idx2, vec_x2, vec_vals2, vec_y2); 623 acolidx += 8; 624 aval += 8; 625 AVX512_Mult_Private(vec_idx3, vec_x3, vec_vals3, vec_y3); 626 acolidx += 8; 627 aval += 8; 628 AVX512_Mult_Private(vec_idx4, vec_x4, vec_vals4, vec_y4); 629 acolidx += 8; 630 aval += 8; 631 } 632 633 vec_y = _mm512_add_pd(vec_y, vec_y2); 634 vec_y = _mm512_add_pd(vec_y, vec_y3); 635 vec_y = _mm512_add_pd(vec_y, vec_y4); 636 if (i == totalslices - 1 && A->rmap->n & 0x07) { /* if last slice has padding rows */ 637 _mm512_mask_storeu_pd(&z[8 * i], mask, vec_y); 638 } else { 639 _mm512_storeu_pd(&z[8 * i], vec_y); 640 } 641 } 642 #elif defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 643 PetscCheck(a->sliceheight == 8, PETSC_COMM_SELF, PETSC_ERR_SUP, "The kernel requires a slice height of 8, but the input matrix has a slice height of %" PetscInt_FMT, a->sliceheight); 644 for (i = 0; i < totalslices; i++) { /* loop over full slices */ 645 PetscPrefetchBlock(acolidx, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 646 PetscPrefetchBlock(aval, a->sliidx[i + 1] - a->sliidx[i], 0, PETSC_PREFETCH_HINT_T0); 647 648 /* last slice may have padding rows. Don't use vectorization. */ 649 if (i == totalslices - 1 && (A->rmap->n & 0x07)) { 650 for (r = 0; r < (A->rmap->n & 0x07); ++r) { 651 row = 8 * i + r; 652 yval = (MatScalar)0.0; 653 nnz_in_row = a->rlen[row]; 654 for (j = 0; j < nnz_in_row; ++j) yval += aval[8 * j + r] * x[acolidx[8 * j + r]]; 655 z[row] = y[row] + yval; 656 } 657 break; 658 } 659 660 vec_y = _mm256_loadu_pd(y + 8 * i); 661 vec_y2 = _mm256_loadu_pd(y + 8 * i + 4); 662 663 /* Process slice of height 8 (512 bits) via two subslices of height 4 (256 bits) via AVX */ 664 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j += 8) { 665 vec_vals = _mm256_loadu_pd(aval); 666 vec_x_tmp = _mm_setzero_pd(); 667 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 668 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 669 vec_x = _mm256_setzero_pd(); 670 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 671 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 672 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 673 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 674 vec_y = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y); 675 aval += 4; 676 677 vec_vals = _mm256_loadu_pd(aval); 678 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 679 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 680 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 0); 681 vec_x_tmp = _mm_loadl_pd(vec_x_tmp, x + *acolidx++); 682 vec_x_tmp = _mm_loadh_pd(vec_x_tmp, x + *acolidx++); 683 vec_x = _mm256_insertf128_pd(vec_x, vec_x_tmp, 1); 684 vec_y2 = _mm256_add_pd(_mm256_mul_pd(vec_x, vec_vals), vec_y2); 685 aval += 4; 686 } 687 688 _mm256_storeu_pd(z + i * 8, vec_y); 689 _mm256_storeu_pd(z + i * 8 + 4, vec_y2); 690 } 691 #else 692 PetscCall(PetscMalloc1(sliceheight, &sum)); 693 for (i = 0; i < totalslices; i++) { /* loop over slices */ 694 for (j = 0; j < sliceheight; j++) { 695 sum[j] = 0.0; 696 for (k = a->sliidx[i] + j; k < a->sliidx[i + 1]; k += sliceheight) sum[j] += aval[k] * x[acolidx[k]]; 697 } 698 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 699 for (j = 0; j < (A->rmap->n % sliceheight); j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 700 } else { 701 for (j = 0; j < sliceheight; j++) z[sliceheight * i + j] = y[sliceheight * i + j] + sum[j]; 702 } 703 } 704 PetscCall(PetscFree(sum)); 705 #endif 706 707 PetscCall(PetscLogFlops(2.0 * a->nz)); 708 PetscCall(VecRestoreArrayRead(xx, &x)); 709 PetscCall(VecRestoreArrayPair(yy, zz, &y, &z)); 710 PetscFunctionReturn(PETSC_SUCCESS); 711 } 712 713 PetscErrorCode MatMultTransposeAdd_SeqSELL(Mat A, Vec xx, Vec zz, Vec yy) 714 { 715 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 716 PetscScalar *y; 717 const PetscScalar *x; 718 const MatScalar *aval = a->val; 719 const PetscInt *acolidx = a->colidx; 720 PetscInt i, j, r, row, nnz_in_row, totalslices = a->totalslices, sliceheight = a->sliceheight; 721 722 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 723 #pragma disjoint(*x, *y, *aval) 724 #endif 725 726 PetscFunctionBegin; 727 if (A->symmetric == PETSC_BOOL3_TRUE) { 728 PetscCall(MatMultAdd_SeqSELL(A, xx, zz, yy)); 729 PetscFunctionReturn(PETSC_SUCCESS); 730 } 731 if (zz != yy) PetscCall(VecCopy(zz, yy)); 732 733 if (a->nz) { 734 PetscCall(VecGetArrayRead(xx, &x)); 735 PetscCall(VecGetArray(yy, &y)); 736 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 737 if (i == totalslices - 1 && (A->rmap->n % sliceheight)) { 738 for (r = 0; r < (A->rmap->n % sliceheight); ++r) { 739 row = sliceheight * i + r; 740 nnz_in_row = a->rlen[row]; 741 for (j = 0; j < nnz_in_row; ++j) y[acolidx[sliceheight * j + r]] += aval[sliceheight * j + r] * x[row]; 742 } 743 break; 744 } 745 for (r = 0; r < sliceheight; ++r) 746 for (j = a->sliidx[i] + r; j < a->sliidx[i + 1]; j += sliceheight) y[acolidx[j]] += aval[j] * x[sliceheight * i + r]; 747 } 748 PetscCall(PetscLogFlops(2.0 * a->nz)); 749 PetscCall(VecRestoreArrayRead(xx, &x)); 750 PetscCall(VecRestoreArray(yy, &y)); 751 } 752 PetscFunctionReturn(PETSC_SUCCESS); 753 } 754 755 PetscErrorCode MatMultTranspose_SeqSELL(Mat A, Vec xx, Vec yy) 756 { 757 PetscFunctionBegin; 758 if (A->symmetric == PETSC_BOOL3_TRUE) { 759 PetscCall(MatMult_SeqSELL(A, xx, yy)); 760 } else { 761 PetscCall(VecSet(yy, 0.0)); 762 PetscCall(MatMultTransposeAdd_SeqSELL(A, xx, yy, yy)); 763 } 764 PetscFunctionReturn(PETSC_SUCCESS); 765 } 766 767 /* 768 Checks for missing diagonals 769 */ 770 PetscErrorCode MatMissingDiagonal_SeqSELL(Mat A, PetscBool *missing, PetscInt *d) 771 { 772 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 773 PetscInt *diag, i; 774 775 PetscFunctionBegin; 776 *missing = PETSC_FALSE; 777 if (A->rmap->n > 0 && !a->colidx) { 778 *missing = PETSC_TRUE; 779 if (d) *d = 0; 780 PetscCall(PetscInfo(A, "Matrix has no entries therefore is missing diagonal\n")); 781 } else { 782 diag = a->diag; 783 for (i = 0; i < A->rmap->n; i++) { 784 if (diag[i] == -1) { 785 *missing = PETSC_TRUE; 786 if (d) *d = i; 787 PetscCall(PetscInfo(A, "Matrix is missing diagonal number %" PetscInt_FMT "\n", i)); 788 break; 789 } 790 } 791 } 792 PetscFunctionReturn(PETSC_SUCCESS); 793 } 794 795 PetscErrorCode MatMarkDiagonal_SeqSELL(Mat A) 796 { 797 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 798 const PetscInt m = A->rmap->n; 799 PetscInt shift; 800 801 PetscFunctionBegin; 802 if (a->diag && a->diagNonzeroState == A->nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 803 if (!a->diag) { 804 PetscCall(PetscMalloc1(m, &a->diag)); 805 a->free_diag = PETSC_TRUE; 806 } 807 a->diagDense = PETSC_TRUE; 808 for (PetscInt i = 0; i < m; i++) { /* loop over rows */ 809 PetscBool found = PETSC_FALSE; 810 811 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 812 a->diag[i] = -1; 813 for (PetscInt j = 0; j < a->rlen[i]; j++) { 814 if (a->colidx[shift + a->sliceheight * j] == i) { 815 a->diag[i] = shift + a->sliceheight * j; 816 found = PETSC_TRUE; 817 break; 818 } 819 } 820 if (!found) a->diagDense = PETSC_FALSE; 821 } 822 a->diagNonzeroState = A->nonzerostate; 823 PetscFunctionReturn(PETSC_SUCCESS); 824 } 825 826 /* 827 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 828 */ 829 static PetscErrorCode MatInvertDiagonalForSOR_SeqSELL(Mat A, PetscScalar omega, PetscScalar fshift) 830 { 831 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 832 PetscInt i, *diag, m = A->rmap->n; 833 MatScalar *val = a->val; 834 PetscScalar *idiag, *mdiag; 835 836 PetscFunctionBegin; 837 if (a->idiagState == ((PetscObject)A)->state && a->omega == omega && a->fshift == fshift) PetscFunctionReturn(PETSC_SUCCESS); 838 PetscCall(MatMarkDiagonal_SeqSELL(A)); 839 PetscCheck(a->diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must have all diagonal location to invert them"); 840 diag = a->diag; 841 if (!a->idiag) { 842 PetscCall(PetscMalloc3(m, &a->idiag, m, &a->mdiag, m, &a->ssor_work)); 843 val = a->val; 844 } 845 mdiag = a->mdiag; 846 idiag = a->idiag; 847 848 if (omega == 1.0 && PetscRealPart(fshift) <= 0.0) { 849 for (i = 0; i < m; i++) { 850 mdiag[i] = val[diag[i]]; 851 if (!PetscAbsScalar(mdiag[i])) { /* zero diagonal */ 852 PetscCheck(PetscRealPart(fshift), PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Zero diagonal on row %" PetscInt_FMT, i); 853 PetscCall(PetscInfo(A, "Zero diagonal on row %" PetscInt_FMT "\n", i)); 854 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 855 A->factorerror_zeropivot_value = 0.0; 856 A->factorerror_zeropivot_row = i; 857 } 858 idiag[i] = 1.0 / val[diag[i]]; 859 } 860 PetscCall(PetscLogFlops(m)); 861 } else { 862 for (i = 0; i < m; i++) { 863 mdiag[i] = val[diag[i]]; 864 idiag[i] = omega / (fshift + val[diag[i]]); 865 } 866 PetscCall(PetscLogFlops(2.0 * m)); 867 } 868 a->idiagState = ((PetscObject)A)->state; 869 a->omega = omega; 870 a->fshift = fshift; 871 PetscFunctionReturn(PETSC_SUCCESS); 872 } 873 874 PetscErrorCode MatZeroEntries_SeqSELL(Mat A) 875 { 876 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 877 878 PetscFunctionBegin; 879 PetscCall(PetscArrayzero(a->val, a->sliidx[a->totalslices])); 880 PetscFunctionReturn(PETSC_SUCCESS); 881 } 882 883 PetscErrorCode MatDestroy_SeqSELL(Mat A) 884 { 885 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 886 887 PetscFunctionBegin; 888 PetscCall(PetscLogObjectState((PetscObject)A, "Rows=%" PetscInt_FMT ", Cols=%" PetscInt_FMT ", NZ=%" PetscInt_FMT, A->rmap->n, A->cmap->n, a->nz)); 889 PetscCall(MatSeqXSELLFreeSELL(A, &a->val, &a->colidx)); 890 PetscCall(ISDestroy(&a->row)); 891 PetscCall(ISDestroy(&a->col)); 892 PetscCall(PetscFree(a->diag)); 893 PetscCall(PetscFree(a->rlen)); 894 PetscCall(PetscFree(a->sliidx)); 895 PetscCall(PetscFree3(a->idiag, a->mdiag, a->ssor_work)); 896 PetscCall(PetscFree(a->solve_work)); 897 PetscCall(ISDestroy(&a->icol)); 898 PetscCall(PetscFree(a->saved_values)); 899 PetscCall(PetscFree2(a->getrowcols, a->getrowvals)); 900 PetscCall(PetscFree(A->data)); 901 #if defined(PETSC_HAVE_CUPM) 902 PetscCall(PetscFree(a->chunk_slice_map)); 903 #endif 904 905 PetscCall(PetscObjectChangeTypeName((PetscObject)A, NULL)); 906 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatStoreValues_C", NULL)); 907 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatRetrieveValues_C", NULL)); 908 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetPreallocation_C", NULL)); 909 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetArray_C", NULL)); 910 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLRestoreArray_C", NULL)); 911 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqaij_C", NULL)); 912 #if defined(PETSC_HAVE_CUDA) 913 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellcuda_C", NULL)); 914 #endif 915 #if defined(PETSC_HAVE_HIP) 916 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqsell_seqsellhip_C", NULL)); 917 #endif 918 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetFillRatio_C", NULL)); 919 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetMaxSliceWidth_C", NULL)); 920 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetAvgSliceWidth_C", NULL)); 921 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLGetVarSliceSize_C", NULL)); 922 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqSELLSetSliceHeight_C", NULL)); 923 PetscFunctionReturn(PETSC_SUCCESS); 924 } 925 926 PetscErrorCode MatSetOption_SeqSELL(Mat A, MatOption op, PetscBool flg) 927 { 928 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 929 930 PetscFunctionBegin; 931 switch (op) { 932 case MAT_ROW_ORIENTED: 933 a->roworiented = flg; 934 break; 935 case MAT_KEEP_NONZERO_PATTERN: 936 a->keepnonzeropattern = flg; 937 break; 938 case MAT_NEW_NONZERO_LOCATIONS: 939 a->nonew = (flg ? 0 : 1); 940 break; 941 case MAT_NEW_NONZERO_LOCATION_ERR: 942 a->nonew = (flg ? -1 : 0); 943 break; 944 case MAT_NEW_NONZERO_ALLOCATION_ERR: 945 a->nonew = (flg ? -2 : 0); 946 break; 947 case MAT_UNUSED_NONZERO_LOCATION_ERR: 948 a->nounused = (flg ? -1 : 0); 949 break; 950 default: 951 break; 952 } 953 PetscFunctionReturn(PETSC_SUCCESS); 954 } 955 956 PetscErrorCode MatGetDiagonal_SeqSELL(Mat A, Vec v) 957 { 958 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 959 PetscInt i, j, n, shift; 960 PetscScalar *x, zero = 0.0; 961 962 PetscFunctionBegin; 963 PetscCall(VecGetLocalSize(v, &n)); 964 PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 965 966 if (A->factortype == MAT_FACTOR_ILU || A->factortype == MAT_FACTOR_LU) { 967 PetscInt *diag = a->diag; 968 PetscCall(VecGetArray(v, &x)); 969 for (i = 0; i < n; i++) x[i] = 1.0 / a->val[diag[i]]; 970 PetscCall(VecRestoreArray(v, &x)); 971 PetscFunctionReturn(PETSC_SUCCESS); 972 } 973 974 PetscCall(VecSet(v, zero)); 975 PetscCall(VecGetArray(v, &x)); 976 for (i = 0; i < n; i++) { /* loop over rows */ 977 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 978 x[i] = 0; 979 for (j = 0; j < a->rlen[i]; j++) { 980 if (a->colidx[shift + a->sliceheight * j] == i) { 981 x[i] = a->val[shift + a->sliceheight * j]; 982 break; 983 } 984 } 985 } 986 PetscCall(VecRestoreArray(v, &x)); 987 PetscFunctionReturn(PETSC_SUCCESS); 988 } 989 990 PetscErrorCode MatDiagonalScale_SeqSELL(Mat A, Vec ll, Vec rr) 991 { 992 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 993 const PetscScalar *l, *r; 994 PetscInt i, j, m, n, row; 995 996 PetscFunctionBegin; 997 if (ll) { 998 /* The local size is used so that VecMPI can be passed to this routine 999 by MatDiagonalScale_MPISELL */ 1000 PetscCall(VecGetLocalSize(ll, &m)); 1001 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1002 PetscCall(VecGetArrayRead(ll, &l)); 1003 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1004 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1005 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1006 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= l[a->sliceheight * i + row]; 1007 } 1008 } else { 1009 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) a->val[j] *= l[a->sliceheight * i + row]; 1010 } 1011 } 1012 PetscCall(VecRestoreArrayRead(ll, &l)); 1013 PetscCall(PetscLogFlops(a->nz)); 1014 } 1015 if (rr) { 1016 PetscCall(VecGetLocalSize(rr, &n)); 1017 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1018 PetscCall(VecGetArrayRead(rr, &r)); 1019 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1020 if (i == a->totalslices - 1 && (A->rmap->n % a->sliceheight)) { /* if last slice has padding rows */ 1021 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = ((row + 1) % a->sliceheight)) { 1022 if (row < (A->rmap->n % a->sliceheight)) a->val[j] *= r[a->colidx[j]]; 1023 } 1024 } else { 1025 for (j = a->sliidx[i]; j < a->sliidx[i + 1]; j++) a->val[j] *= r[a->colidx[j]]; 1026 } 1027 } 1028 PetscCall(VecRestoreArrayRead(rr, &r)); 1029 PetscCall(PetscLogFlops(a->nz)); 1030 } 1031 #if defined(PETSC_HAVE_CUPM) 1032 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 1033 #endif 1034 PetscFunctionReturn(PETSC_SUCCESS); 1035 } 1036 1037 PetscErrorCode MatGetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], PetscScalar v[]) 1038 { 1039 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1040 PetscInt *cp, i, k, low, high, t, row, col, l; 1041 PetscInt shift; 1042 MatScalar *vp; 1043 1044 PetscFunctionBegin; 1045 for (k = 0; k < m; k++) { /* loop over requested rows */ 1046 row = im[k]; 1047 if (row < 0) continue; 1048 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1049 shift = a->sliidx[row / a->sliceheight] + (row % a->sliceheight); /* starting index of the row */ 1050 cp = a->colidx + shift; /* pointer to the row */ 1051 vp = a->val + shift; /* pointer to the row */ 1052 for (l = 0; l < n; l++) { /* loop over requested columns */ 1053 col = in[l]; 1054 if (col < 0) continue; 1055 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Column too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1056 high = a->rlen[row]; 1057 low = 0; /* assume unsorted */ 1058 while (high - low > 5) { 1059 t = (low + high) / 2; 1060 if (*(cp + a->sliceheight * t) > col) high = t; 1061 else low = t; 1062 } 1063 for (i = low; i < high; i++) { 1064 if (*(cp + a->sliceheight * i) > col) break; 1065 if (*(cp + a->sliceheight * i) == col) { 1066 *v++ = *(vp + a->sliceheight * i); 1067 goto finished; 1068 } 1069 } 1070 *v++ = 0.0; 1071 finished:; 1072 } 1073 } 1074 PetscFunctionReturn(PETSC_SUCCESS); 1075 } 1076 1077 static PetscErrorCode MatView_SeqSELL_ASCII(Mat A, PetscViewer viewer) 1078 { 1079 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1080 PetscInt i, j, m = A->rmap->n, shift; 1081 const char *name; 1082 PetscViewerFormat format; 1083 1084 PetscFunctionBegin; 1085 PetscCall(PetscViewerGetFormat(viewer, &format)); 1086 if (format == PETSC_VIEWER_ASCII_MATLAB) { 1087 PetscInt nofinalvalue = 0; 1088 /* 1089 if (m && ((a->i[m] == a->i[m-1]) || (a->j[a->nz-1] != A->cmap->n-1))) nofinalvalue = 1; 1090 */ 1091 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1092 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Size = %" PetscInt_FMT " %" PetscInt_FMT " \n", m, A->cmap->n)); 1093 PetscCall(PetscViewerASCIIPrintf(viewer, "%% Nonzeros = %" PetscInt_FMT " \n", a->nz)); 1094 #if defined(PETSC_USE_COMPLEX) 1095 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",4);\n", a->nz + nofinalvalue)); 1096 #else 1097 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = zeros(%" PetscInt_FMT ",3);\n", a->nz + nofinalvalue)); 1098 #endif 1099 PetscCall(PetscViewerASCIIPrintf(viewer, "zzz = [\n")); 1100 1101 for (i = 0; i < m; i++) { 1102 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1103 for (j = 0; j < a->rlen[i]; j++) { 1104 #if defined(PETSC_USE_COMPLEX) 1105 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1106 #else 1107 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n", i + 1, a->colidx[shift + a->sliceheight * j] + 1, (double)a->val[shift + a->sliceheight * j])); 1108 #endif 1109 } 1110 } 1111 /* 1112 if (nofinalvalue) { 1113 #if defined(PETSC_USE_COMPLEX) 1114 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e %18.16e\n",m,A->cmap->n,0.,0.)); 1115 #else 1116 PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT " %" PetscInt_FMT " %18.16e\n",m,A->cmap->n,0.0)); 1117 #endif 1118 } 1119 */ 1120 PetscCall(PetscObjectGetName((PetscObject)A, &name)); 1121 PetscCall(PetscViewerASCIIPrintf(viewer, "];\n %s = spconvert(zzz);\n", name)); 1122 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1123 } else if (format == PETSC_VIEWER_ASCII_FACTOR_INFO || format == PETSC_VIEWER_ASCII_INFO) { 1124 PetscFunctionReturn(PETSC_SUCCESS); 1125 } else if (format == PETSC_VIEWER_ASCII_COMMON) { 1126 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1127 for (i = 0; i < m; i++) { 1128 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1129 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1130 for (j = 0; j < a->rlen[i]; j++) { 1131 #if defined(PETSC_USE_COMPLEX) 1132 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1133 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1134 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0 && PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1135 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1136 } else if (PetscRealPart(a->val[shift + a->sliceheight * j]) != 0.0) { 1137 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1138 } 1139 #else 1140 if (a->val[shift + a->sliceheight * j] != 0.0) PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1141 #endif 1142 } 1143 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1144 } 1145 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1146 } else if (format == PETSC_VIEWER_ASCII_DENSE) { 1147 PetscInt cnt = 0, jcnt; 1148 PetscScalar value; 1149 #if defined(PETSC_USE_COMPLEX) 1150 PetscBool realonly = PETSC_TRUE; 1151 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1152 if (PetscImaginaryPart(a->val[i]) != 0.0) { 1153 realonly = PETSC_FALSE; 1154 break; 1155 } 1156 } 1157 #endif 1158 1159 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1160 for (i = 0; i < m; i++) { 1161 jcnt = 0; 1162 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1163 for (j = 0; j < A->cmap->n; j++) { 1164 if (jcnt < a->rlen[i] && j == a->colidx[shift + a->sliceheight * j]) { 1165 value = a->val[cnt++]; 1166 jcnt++; 1167 } else { 1168 value = 0.0; 1169 } 1170 #if defined(PETSC_USE_COMPLEX) 1171 if (realonly) { 1172 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)PetscRealPart(value))); 1173 } else { 1174 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e+%7.5e i ", (double)PetscRealPart(value), (double)PetscImaginaryPart(value))); 1175 } 1176 #else 1177 PetscCall(PetscViewerASCIIPrintf(viewer, " %7.5e ", (double)value)); 1178 #endif 1179 } 1180 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1181 } 1182 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1183 } else if (format == PETSC_VIEWER_ASCII_MATRIXMARKET) { 1184 PetscInt fshift = 1; 1185 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1186 #if defined(PETSC_USE_COMPLEX) 1187 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate complex general\n")); 1188 #else 1189 PetscCall(PetscViewerASCIIPrintf(viewer, "%%%%MatrixMarket matrix coordinate real general\n")); 1190 #endif 1191 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, A->cmap->n, a->nz)); 1192 for (i = 0; i < m; i++) { 1193 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1194 for (j = 0; j < a->rlen[i]; j++) { 1195 #if defined(PETSC_USE_COMPLEX) 1196 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1197 #else 1198 PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT " %" PetscInt_FMT " %g\n", i + fshift, a->colidx[shift + a->sliceheight * j] + fshift, (double)a->val[shift + a->sliceheight * j])); 1199 #endif 1200 } 1201 } 1202 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1203 } else if (format == PETSC_VIEWER_NATIVE) { 1204 for (i = 0; i < a->totalslices; i++) { /* loop over slices */ 1205 PetscInt row; 1206 PetscCall(PetscViewerASCIIPrintf(viewer, "slice %" PetscInt_FMT ": %" PetscInt_FMT " %" PetscInt_FMT "\n", i, a->sliidx[i], a->sliidx[i + 1])); 1207 for (j = a->sliidx[i], row = 0; j < a->sliidx[i + 1]; j++, row = (row + 1) % a->sliceheight) { 1208 #if defined(PETSC_USE_COMPLEX) 1209 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1210 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g + %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1211 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1212 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g - %g i\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]), -(double)PetscImaginaryPart(a->val[j]))); 1213 } else { 1214 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)PetscRealPart(a->val[j]))); 1215 } 1216 #else 1217 PetscCall(PetscViewerASCIIPrintf(viewer, " %" PetscInt_FMT " %" PetscInt_FMT " %g\n", a->sliceheight * i + row, a->colidx[j], (double)a->val[j])); 1218 #endif 1219 } 1220 } 1221 } else { 1222 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_FALSE)); 1223 if (A->factortype) { 1224 for (i = 0; i < m; i++) { 1225 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1226 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1227 /* L part */ 1228 for (j = shift; j < a->diag[i]; j += a->sliceheight) { 1229 #if defined(PETSC_USE_COMPLEX) 1230 if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) > 0.0) { 1231 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1232 } else if (PetscImaginaryPart(a->val[shift + a->sliceheight * j]) < 0.0) { 1233 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1234 } else { 1235 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1236 } 1237 #else 1238 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1239 #endif 1240 } 1241 /* diagonal */ 1242 j = a->diag[i]; 1243 #if defined(PETSC_USE_COMPLEX) 1244 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1245 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)PetscImaginaryPart(1.0 / a->val[j]))); 1246 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1247 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]), (double)(-PetscImaginaryPart(1.0 / a->val[j])))); 1248 } else { 1249 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(1.0 / a->val[j]))); 1250 } 1251 #else 1252 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)(1 / a->val[j]))); 1253 #endif 1254 1255 /* U part */ 1256 for (j = a->diag[i] + 1; j < shift + a->sliceheight * a->rlen[i]; j += a->sliceheight) { 1257 #if defined(PETSC_USE_COMPLEX) 1258 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1259 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)PetscImaginaryPart(a->val[j]))); 1260 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1261 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[j], (double)PetscRealPart(a->val[j]), (double)(-PetscImaginaryPart(a->val[j])))); 1262 } else { 1263 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)PetscRealPart(a->val[j]))); 1264 } 1265 #else 1266 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[j], (double)a->val[j])); 1267 #endif 1268 } 1269 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1270 } 1271 } else { 1272 for (i = 0; i < m; i++) { 1273 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1274 PetscCall(PetscViewerASCIIPrintf(viewer, "row %" PetscInt_FMT ":", i)); 1275 for (j = 0; j < a->rlen[i]; j++) { 1276 #if defined(PETSC_USE_COMPLEX) 1277 if (PetscImaginaryPart(a->val[j]) > 0.0) { 1278 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g + %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1279 } else if (PetscImaginaryPart(a->val[j]) < 0.0) { 1280 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g - %g i)", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]), (double)-PetscImaginaryPart(a->val[shift + a->sliceheight * j]))); 1281 } else { 1282 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)PetscRealPart(a->val[shift + a->sliceheight * j]))); 1283 } 1284 #else 1285 PetscCall(PetscViewerASCIIPrintf(viewer, " (%" PetscInt_FMT ", %g) ", a->colidx[shift + a->sliceheight * j], (double)a->val[shift + a->sliceheight * j])); 1286 #endif 1287 } 1288 PetscCall(PetscViewerASCIIPrintf(viewer, "\n")); 1289 } 1290 } 1291 PetscCall(PetscViewerASCIIUseTabs(viewer, PETSC_TRUE)); 1292 } 1293 PetscCall(PetscViewerFlush(viewer)); 1294 PetscFunctionReturn(PETSC_SUCCESS); 1295 } 1296 1297 #include <petscdraw.h> 1298 static PetscErrorCode MatView_SeqSELL_Draw_Zoom(PetscDraw draw, void *Aa) 1299 { 1300 Mat A = (Mat)Aa; 1301 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1302 PetscInt i, j, m = A->rmap->n, shift; 1303 int color; 1304 PetscReal xl, yl, xr, yr, x_l, x_r, y_l, y_r; 1305 PetscViewer viewer; 1306 PetscViewerFormat format; 1307 1308 PetscFunctionBegin; 1309 PetscCall(PetscObjectQuery((PetscObject)A, "Zoomviewer", (PetscObject *)&viewer)); 1310 PetscCall(PetscViewerGetFormat(viewer, &format)); 1311 PetscCall(PetscDrawGetCoordinates(draw, &xl, &yl, &xr, &yr)); 1312 1313 /* loop over matrix elements drawing boxes */ 1314 1315 if (format != PETSC_VIEWER_DRAW_CONTOUR) { 1316 PetscDrawCollectiveBegin(draw); 1317 /* Blue for negative, Cyan for zero and Red for positive */ 1318 color = PETSC_DRAW_BLUE; 1319 for (i = 0; i < m; i++) { 1320 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1321 y_l = m - i - 1.0; 1322 y_r = y_l + 1.0; 1323 for (j = 0; j < a->rlen[i]; j++) { 1324 x_l = a->colidx[shift + a->sliceheight * j]; 1325 x_r = x_l + 1.0; 1326 if (PetscRealPart(a->val[shift + a->sliceheight * j]) >= 0.) continue; 1327 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1328 } 1329 } 1330 color = PETSC_DRAW_CYAN; 1331 for (i = 0; i < m; i++) { 1332 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1333 y_l = m - i - 1.0; 1334 y_r = y_l + 1.0; 1335 for (j = 0; j < a->rlen[i]; j++) { 1336 x_l = a->colidx[shift + a->sliceheight * j]; 1337 x_r = x_l + 1.0; 1338 if (a->val[shift + a->sliceheight * j] != 0.) continue; 1339 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1340 } 1341 } 1342 color = PETSC_DRAW_RED; 1343 for (i = 0; i < m; i++) { 1344 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1345 y_l = m - i - 1.0; 1346 y_r = y_l + 1.0; 1347 for (j = 0; j < a->rlen[i]; j++) { 1348 x_l = a->colidx[shift + a->sliceheight * j]; 1349 x_r = x_l + 1.0; 1350 if (PetscRealPart(a->val[shift + a->sliceheight * j]) <= 0.) continue; 1351 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1352 } 1353 } 1354 PetscDrawCollectiveEnd(draw); 1355 } else { 1356 /* use contour shading to indicate magnitude of values */ 1357 /* first determine max of all nonzero values */ 1358 PetscReal minv = 0.0, maxv = 0.0; 1359 PetscInt count = 0; 1360 PetscDraw popup; 1361 for (i = 0; i < a->sliidx[a->totalslices]; i++) { 1362 if (PetscAbsScalar(a->val[i]) > maxv) maxv = PetscAbsScalar(a->val[i]); 1363 } 1364 if (minv >= maxv) maxv = minv + PETSC_SMALL; 1365 PetscCall(PetscDrawGetPopup(draw, &popup)); 1366 PetscCall(PetscDrawScalePopup(popup, minv, maxv)); 1367 1368 PetscDrawCollectiveBegin(draw); 1369 for (i = 0; i < m; i++) { 1370 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; 1371 y_l = m - i - 1.0; 1372 y_r = y_l + 1.0; 1373 for (j = 0; j < a->rlen[i]; j++) { 1374 x_l = a->colidx[shift + a->sliceheight * j]; 1375 x_r = x_l + 1.0; 1376 color = PetscDrawRealToColor(PetscAbsScalar(a->val[count]), minv, maxv); 1377 PetscCall(PetscDrawRectangle(draw, x_l, y_l, x_r, y_r, color, color, color, color)); 1378 count++; 1379 } 1380 } 1381 PetscDrawCollectiveEnd(draw); 1382 } 1383 PetscFunctionReturn(PETSC_SUCCESS); 1384 } 1385 1386 #include <petscdraw.h> 1387 static PetscErrorCode MatView_SeqSELL_Draw(Mat A, PetscViewer viewer) 1388 { 1389 PetscDraw draw; 1390 PetscReal xr, yr, xl, yl, h, w; 1391 PetscBool isnull; 1392 1393 PetscFunctionBegin; 1394 PetscCall(PetscViewerDrawGetDraw(viewer, 0, &draw)); 1395 PetscCall(PetscDrawIsNull(draw, &isnull)); 1396 if (isnull) PetscFunctionReturn(PETSC_SUCCESS); 1397 1398 xr = A->cmap->n; 1399 yr = A->rmap->n; 1400 h = yr / 10.0; 1401 w = xr / 10.0; 1402 xr += w; 1403 yr += h; 1404 xl = -w; 1405 yl = -h; 1406 PetscCall(PetscDrawSetCoordinates(draw, xl, yl, xr, yr)); 1407 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", (PetscObject)viewer)); 1408 PetscCall(PetscDrawZoom(draw, MatView_SeqSELL_Draw_Zoom, A)); 1409 PetscCall(PetscObjectCompose((PetscObject)A, "Zoomviewer", NULL)); 1410 PetscCall(PetscDrawSave(draw)); 1411 PetscFunctionReturn(PETSC_SUCCESS); 1412 } 1413 1414 PetscErrorCode MatView_SeqSELL(Mat A, PetscViewer viewer) 1415 { 1416 PetscBool isascii, isbinary, isdraw; 1417 1418 PetscFunctionBegin; 1419 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &isascii)); 1420 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERBINARY, &isbinary)); 1421 PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERDRAW, &isdraw)); 1422 if (isascii) { 1423 PetscCall(MatView_SeqSELL_ASCII(A, viewer)); 1424 } else if (isbinary) { 1425 /* PetscCall(MatView_SeqSELL_Binary(A,viewer)); */ 1426 } else if (isdraw) PetscCall(MatView_SeqSELL_Draw(A, viewer)); 1427 PetscFunctionReturn(PETSC_SUCCESS); 1428 } 1429 1430 PetscErrorCode MatAssemblyEnd_SeqSELL(Mat A, MatAssemblyType mode) 1431 { 1432 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1433 PetscInt i, shift, row_in_slice, row, nrow, *cp, lastcol, j, k; 1434 MatScalar *vp; 1435 #if defined(PETSC_HAVE_CUPM) 1436 PetscInt totalchunks = 0; 1437 #endif 1438 1439 PetscFunctionBegin; 1440 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 1441 /* To do: compress out the unused elements */ 1442 PetscCall(MatMarkDiagonal_SeqSELL(A)); 1443 PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: %" PetscInt_FMT " allocated %" PetscInt_FMT " used (%" PetscInt_FMT " nonzeros+%" PetscInt_FMT " paddedzeros)\n", A->rmap->n, A->cmap->n, a->maxallocmat, a->sliidx[a->totalslices], a->nz, a->sliidx[a->totalslices] - a->nz)); 1444 PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is %" PetscInt_FMT "\n", a->reallocs)); 1445 PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rlenmax)); 1446 a->nonzerorowcnt = 0; 1447 /* Set unused slots for column indices to last valid column index. Set unused slots for values to zero. This allows for a use of unmasked intrinsics -> higher performance */ 1448 for (i = 0; i < a->totalslices; ++i) { 1449 shift = a->sliidx[i]; /* starting index of the slice */ 1450 cp = PetscSafePointerPlusOffset(a->colidx, shift); /* pointer to the column indices of the slice */ 1451 vp = PetscSafePointerPlusOffset(a->val, shift); /* pointer to the nonzero values of the slice */ 1452 for (row_in_slice = 0; row_in_slice < a->sliceheight; ++row_in_slice) { /* loop over rows in the slice */ 1453 row = a->sliceheight * i + row_in_slice; 1454 nrow = a->rlen[row]; /* number of nonzeros in row */ 1455 /* 1456 Search for the nearest nonzero. Normally setting the index to zero may cause extra communication. 1457 But if the entire slice are empty, it is fine to use 0 since the index will not be loaded. 1458 */ 1459 lastcol = 0; 1460 if (nrow > 0) { /* nonempty row */ 1461 a->nonzerorowcnt++; 1462 lastcol = cp[a->sliceheight * (nrow - 1) + row_in_slice]; /* use the index from the last nonzero at current row */ 1463 } else if (!row_in_slice) { /* first row of the correct slice is empty */ 1464 for (j = 1; j < a->sliceheight; j++) { 1465 if (a->rlen[a->sliceheight * i + j]) { 1466 lastcol = cp[j]; 1467 break; 1468 } 1469 } 1470 } else { 1471 if (a->sliidx[i + 1] != shift) lastcol = cp[row_in_slice - 1]; /* use the index from the previous row */ 1472 } 1473 1474 for (k = nrow; k < (a->sliidx[i + 1] - shift) / a->sliceheight; ++k) { 1475 cp[a->sliceheight * k + row_in_slice] = lastcol; 1476 vp[a->sliceheight * k + row_in_slice] = (MatScalar)0; 1477 } 1478 } 1479 } 1480 1481 A->info.mallocs += a->reallocs; 1482 a->reallocs = 0; 1483 1484 #if defined(PETSC_HAVE_CUPM) 1485 if (!a->chunksize && a->totalslices) { 1486 a->chunksize = 64; 1487 while (a->chunksize < 1024 && 2 * a->chunksize <= a->sliidx[a->totalslices] / a->totalslices) a->chunksize *= 2; 1488 totalchunks = 1 + (a->sliidx[a->totalslices] - 1) / a->chunksize; 1489 } 1490 if (totalchunks != a->totalchunks) { 1491 PetscCall(PetscFree(a->chunk_slice_map)); 1492 PetscCall(PetscMalloc1(totalchunks, &a->chunk_slice_map)); 1493 a->totalchunks = totalchunks; 1494 } 1495 j = 0; 1496 for (i = 0; i < totalchunks; i++) { 1497 while (a->sliidx[j + 1] <= i * a->chunksize && j < a->totalslices) j++; 1498 a->chunk_slice_map[i] = j; 1499 } 1500 #endif 1501 PetscFunctionReturn(PETSC_SUCCESS); 1502 } 1503 1504 PetscErrorCode MatGetInfo_SeqSELL(Mat A, MatInfoType flag, MatInfo *info) 1505 { 1506 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1507 1508 PetscFunctionBegin; 1509 info->block_size = 1.0; 1510 info->nz_allocated = a->maxallocmat; 1511 info->nz_used = a->sliidx[a->totalslices]; /* include padding zeros */ 1512 info->nz_unneeded = (a->maxallocmat - a->sliidx[a->totalslices]); 1513 info->assemblies = A->num_ass; 1514 info->mallocs = A->info.mallocs; 1515 info->memory = 0; /* REVIEW ME */ 1516 if (A->factortype) { 1517 info->fill_ratio_given = A->info.fill_ratio_given; 1518 info->fill_ratio_needed = A->info.fill_ratio_needed; 1519 info->factor_mallocs = A->info.factor_mallocs; 1520 } else { 1521 info->fill_ratio_given = 0; 1522 info->fill_ratio_needed = 0; 1523 info->factor_mallocs = 0; 1524 } 1525 PetscFunctionReturn(PETSC_SUCCESS); 1526 } 1527 1528 PetscErrorCode MatSetValues_SeqSELL(Mat A, PetscInt m, const PetscInt im[], PetscInt n, const PetscInt in[], const PetscScalar v[], InsertMode is) 1529 { 1530 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1531 PetscInt shift, i, k, l, low, high, t, ii, row, col, nrow; 1532 PetscInt *cp, nonew = a->nonew, lastcol = -1; 1533 MatScalar *vp, value; 1534 #if defined(PETSC_HAVE_CUPM) 1535 PetscBool inserted = PETSC_FALSE; 1536 PetscInt mul = DEVICE_MEM_ALIGN / a->sliceheight; 1537 #endif 1538 1539 PetscFunctionBegin; 1540 for (k = 0; k < m; k++) { /* loop over added rows */ 1541 row = im[k]; 1542 if (row < 0) continue; 1543 PetscCheck(row < A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Row too large: row %" PetscInt_FMT " max %" PetscInt_FMT, row, A->rmap->n - 1); 1544 shift = a->sliidx[row / a->sliceheight] + row % a->sliceheight; /* starting index of the row */ 1545 cp = a->colidx + shift; /* pointer to the row */ 1546 vp = a->val + shift; /* pointer to the row */ 1547 nrow = a->rlen[row]; 1548 low = 0; 1549 high = nrow; 1550 1551 for (l = 0; l < n; l++) { /* loop over added columns */ 1552 col = in[l]; 1553 if (col < 0) continue; 1554 PetscCheck(col < A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Col too large: row %" PetscInt_FMT " max %" PetscInt_FMT, col, A->cmap->n - 1); 1555 if (a->roworiented) { 1556 value = v[l + k * n]; 1557 } else { 1558 value = v[k + l * m]; 1559 } 1560 if ((value == 0.0 && a->ignorezeroentries) && (is == ADD_VALUES)) continue; 1561 1562 /* search in this row for the specified column, i indicates the column to be set */ 1563 if (col <= lastcol) low = 0; 1564 else high = nrow; 1565 lastcol = col; 1566 while (high - low > 5) { 1567 t = (low + high) / 2; 1568 if (*(cp + a->sliceheight * t) > col) high = t; 1569 else low = t; 1570 } 1571 for (i = low; i < high; i++) { 1572 if (*(cp + a->sliceheight * i) > col) break; 1573 if (*(cp + a->sliceheight * i) == col) { 1574 if (is == ADD_VALUES) *(vp + a->sliceheight * i) += value; 1575 else *(vp + a->sliceheight * i) = value; 1576 #if defined(PETSC_HAVE_CUPM) 1577 inserted = PETSC_TRUE; 1578 #endif 1579 low = i + 1; 1580 goto noinsert; 1581 } 1582 } 1583 if (value == 0.0 && a->ignorezeroentries) goto noinsert; 1584 if (nonew == 1) goto noinsert; 1585 PetscCheck(nonew != -1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Inserting a new nonzero (%" PetscInt_FMT ", %" PetscInt_FMT ") in the matrix", row, col); 1586 #if defined(PETSC_HAVE_CUPM) 1587 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, mul); 1588 #else 1589 /* If the current row length exceeds the slice width (e.g. nrow==slice_width), allocate a new space, otherwise do nothing */ 1590 MatSeqXSELLReallocateSELL(A, A->rmap->n, 1, nrow, a->sliidx, a->sliceheight, row / a->sliceheight, row, col, a->colidx, a->val, cp, vp, nonew, MatScalar, 1); 1591 #endif 1592 /* add the new nonzero to the high position, shift the remaining elements in current row to the right by one slot */ 1593 for (ii = nrow - 1; ii >= i; ii--) { 1594 *(cp + a->sliceheight * (ii + 1)) = *(cp + a->sliceheight * ii); 1595 *(vp + a->sliceheight * (ii + 1)) = *(vp + a->sliceheight * ii); 1596 } 1597 a->rlen[row]++; 1598 *(cp + a->sliceheight * i) = col; 1599 *(vp + a->sliceheight * i) = value; 1600 a->nz++; 1601 #if defined(PETSC_HAVE_CUPM) 1602 inserted = PETSC_TRUE; 1603 #endif 1604 low = i + 1; 1605 high++; 1606 nrow++; 1607 noinsert:; 1608 } 1609 a->rlen[row] = nrow; 1610 } 1611 #if defined(PETSC_HAVE_CUPM) 1612 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && inserted) A->offloadmask = PETSC_OFFLOAD_CPU; 1613 #endif 1614 PetscFunctionReturn(PETSC_SUCCESS); 1615 } 1616 1617 PetscErrorCode MatCopy_SeqSELL(Mat A, Mat B, MatStructure str) 1618 { 1619 PetscFunctionBegin; 1620 /* If the two matrices have the same copy implementation, use fast copy. */ 1621 if (str == SAME_NONZERO_PATTERN && (A->ops->copy == B->ops->copy)) { 1622 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1623 Mat_SeqSELL *b = (Mat_SeqSELL *)B->data; 1624 1625 PetscCheck(a->sliidx[a->totalslices] == b->sliidx[b->totalslices], PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Number of nonzeros in two matrices are different"); 1626 PetscCall(PetscArraycpy(b->val, a->val, a->sliidx[a->totalslices])); 1627 } else { 1628 PetscCall(MatCopy_Basic(A, B, str)); 1629 } 1630 PetscFunctionReturn(PETSC_SUCCESS); 1631 } 1632 1633 PetscErrorCode MatSetUp_SeqSELL(Mat A) 1634 { 1635 PetscFunctionBegin; 1636 PetscCall(MatSeqSELLSetPreallocation(A, PETSC_DEFAULT, NULL)); 1637 PetscFunctionReturn(PETSC_SUCCESS); 1638 } 1639 1640 PetscErrorCode MatSeqSELLGetArray_SeqSELL(Mat A, PetscScalar *array[]) 1641 { 1642 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1643 1644 PetscFunctionBegin; 1645 *array = a->val; 1646 PetscFunctionReturn(PETSC_SUCCESS); 1647 } 1648 1649 PetscErrorCode MatSeqSELLRestoreArray_SeqSELL(Mat A, PetscScalar *array[]) 1650 { 1651 PetscFunctionBegin; 1652 PetscFunctionReturn(PETSC_SUCCESS); 1653 } 1654 1655 PetscErrorCode MatScale_SeqSELL(Mat inA, PetscScalar alpha) 1656 { 1657 Mat_SeqSELL *a = (Mat_SeqSELL *)inA->data; 1658 MatScalar *aval = a->val; 1659 PetscScalar oalpha = alpha; 1660 PetscBLASInt one = 1, size; 1661 1662 PetscFunctionBegin; 1663 PetscCall(PetscBLASIntCast(a->sliidx[a->totalslices], &size)); 1664 PetscCallBLAS("BLASscal", BLASscal_(&size, &oalpha, aval, &one)); 1665 PetscCall(PetscLogFlops(a->nz)); 1666 #if defined(PETSC_HAVE_CUPM) 1667 if (inA->offloadmask != PETSC_OFFLOAD_UNALLOCATED) inA->offloadmask = PETSC_OFFLOAD_CPU; 1668 #endif 1669 PetscFunctionReturn(PETSC_SUCCESS); 1670 } 1671 1672 PetscErrorCode MatShift_SeqSELL(Mat Y, PetscScalar a) 1673 { 1674 Mat_SeqSELL *y = (Mat_SeqSELL *)Y->data; 1675 1676 PetscFunctionBegin; 1677 if (!Y->preallocated || !y->nz) PetscCall(MatSeqSELLSetPreallocation(Y, 1, NULL)); 1678 PetscCall(MatShift_Basic(Y, a)); 1679 PetscFunctionReturn(PETSC_SUCCESS); 1680 } 1681 1682 PetscErrorCode MatSOR_SeqSELL(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 1683 { 1684 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 1685 PetscScalar *x, sum, *t; 1686 const MatScalar *idiag = NULL, *mdiag; 1687 const PetscScalar *b, *xb; 1688 PetscInt n, m = A->rmap->n, i, j, shift; 1689 const PetscInt *diag; 1690 1691 PetscFunctionBegin; 1692 its = its * lits; 1693 1694 PetscCall(MatInvertDiagonalForSOR_SeqSELL(A, omega, fshift)); 1695 1696 diag = a->diag; 1697 t = a->ssor_work; 1698 idiag = a->idiag; 1699 mdiag = a->mdiag; 1700 1701 PetscCall(VecGetArray(xx, &x)); 1702 PetscCall(VecGetArrayRead(bb, &b)); 1703 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1704 PetscCheck(flag != SOR_APPLY_UPPER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_UPPER is not implemented"); 1705 PetscCheck(flag != SOR_APPLY_LOWER, PETSC_COMM_SELF, PETSC_ERR_SUP, "SOR_APPLY_LOWER is not implemented"); 1706 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat"); 1707 1708 if (flag & SOR_ZERO_INITIAL_GUESS) { 1709 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1710 for (i = 0; i < m; i++) { 1711 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1712 sum = b[i]; 1713 n = (diag[i] - shift) / a->sliceheight; 1714 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1715 t[i] = sum; 1716 x[i] = sum * idiag[i]; 1717 } 1718 xb = t; 1719 PetscCall(PetscLogFlops(a->nz)); 1720 } else xb = b; 1721 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1722 for (i = m - 1; i >= 0; i--) { 1723 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1724 sum = xb[i]; 1725 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1726 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1727 if (xb == b) { 1728 x[i] = sum * idiag[i]; 1729 } else { 1730 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1731 } 1732 } 1733 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1734 } 1735 its--; 1736 } 1737 while (its--) { 1738 if ((flag & SOR_FORWARD_SWEEP) || (flag & SOR_LOCAL_FORWARD_SWEEP)) { 1739 for (i = 0; i < m; i++) { 1740 /* lower */ 1741 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1742 sum = b[i]; 1743 n = (diag[i] - shift) / a->sliceheight; 1744 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1745 t[i] = sum; /* save application of the lower-triangular part */ 1746 /* upper */ 1747 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1748 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1749 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1750 } 1751 xb = t; 1752 PetscCall(PetscLogFlops(2.0 * a->nz)); 1753 } else xb = b; 1754 if ((flag & SOR_BACKWARD_SWEEP) || (flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1755 for (i = m - 1; i >= 0; i--) { 1756 shift = a->sliidx[i / a->sliceheight] + i % a->sliceheight; /* starting index of the row i */ 1757 sum = xb[i]; 1758 if (xb == b) { 1759 /* whole matrix (no checkpointing available) */ 1760 n = a->rlen[i]; 1761 for (j = 0; j < n; j++) sum -= a->val[shift + a->sliceheight * j] * x[a->colidx[shift + a->sliceheight * j]]; 1762 x[i] = (1. - omega) * x[i] + (sum + mdiag[i] * x[i]) * idiag[i]; 1763 } else { /* lower-triangular part has been saved, so only apply upper-triangular */ 1764 n = a->rlen[i] - (diag[i] - shift) / a->sliceheight - 1; 1765 for (j = 1; j <= n; j++) sum -= a->val[diag[i] + a->sliceheight * j] * x[a->colidx[diag[i] + a->sliceheight * j]]; 1766 x[i] = (1. - omega) * x[i] + sum * idiag[i]; /* omega in idiag */ 1767 } 1768 } 1769 if (xb == b) { 1770 PetscCall(PetscLogFlops(2.0 * a->nz)); 1771 } else { 1772 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper */ 1773 } 1774 } 1775 } 1776 PetscCall(VecRestoreArray(xx, &x)); 1777 PetscCall(VecRestoreArrayRead(bb, &b)); 1778 PetscFunctionReturn(PETSC_SUCCESS); 1779 } 1780 1781 static struct _MatOps MatOps_Values = {MatSetValues_SeqSELL, 1782 MatGetRow_SeqSELL, 1783 MatRestoreRow_SeqSELL, 1784 MatMult_SeqSELL, 1785 /* 4*/ MatMultAdd_SeqSELL, 1786 MatMultTranspose_SeqSELL, 1787 MatMultTransposeAdd_SeqSELL, 1788 NULL, 1789 NULL, 1790 NULL, 1791 /* 10*/ NULL, 1792 NULL, 1793 NULL, 1794 MatSOR_SeqSELL, 1795 NULL, 1796 /* 15*/ MatGetInfo_SeqSELL, 1797 MatEqual_SeqSELL, 1798 MatGetDiagonal_SeqSELL, 1799 MatDiagonalScale_SeqSELL, 1800 NULL, 1801 /* 20*/ NULL, 1802 MatAssemblyEnd_SeqSELL, 1803 MatSetOption_SeqSELL, 1804 MatZeroEntries_SeqSELL, 1805 /* 24*/ NULL, 1806 NULL, 1807 NULL, 1808 NULL, 1809 NULL, 1810 /* 29*/ MatSetUp_SeqSELL, 1811 NULL, 1812 NULL, 1813 NULL, 1814 NULL, 1815 /* 34*/ MatDuplicate_SeqSELL, 1816 NULL, 1817 NULL, 1818 NULL, 1819 NULL, 1820 /* 39*/ NULL, 1821 NULL, 1822 NULL, 1823 MatGetValues_SeqSELL, 1824 MatCopy_SeqSELL, 1825 /* 44*/ NULL, 1826 MatScale_SeqSELL, 1827 MatShift_SeqSELL, 1828 NULL, 1829 NULL, 1830 /* 49*/ NULL, 1831 NULL, 1832 NULL, 1833 NULL, 1834 NULL, 1835 /* 54*/ MatFDColoringCreate_SeqXAIJ, 1836 NULL, 1837 NULL, 1838 NULL, 1839 NULL, 1840 /* 59*/ NULL, 1841 MatDestroy_SeqSELL, 1842 MatView_SeqSELL, 1843 NULL, 1844 NULL, 1845 /* 64*/ NULL, 1846 NULL, 1847 NULL, 1848 NULL, 1849 NULL, 1850 /* 69*/ NULL, 1851 NULL, 1852 NULL, 1853 MatFDColoringApply_AIJ, /* reuse the FDColoring function for AIJ */ 1854 NULL, 1855 /* 74*/ NULL, 1856 NULL, 1857 NULL, 1858 NULL, 1859 NULL, 1860 /* 79*/ NULL, 1861 NULL, 1862 NULL, 1863 NULL, 1864 NULL, 1865 /* 84*/ NULL, 1866 NULL, 1867 NULL, 1868 NULL, 1869 NULL, 1870 /* 89*/ NULL, 1871 NULL, 1872 NULL, 1873 NULL, 1874 MatConjugate_SeqSELL, 1875 /* 94*/ NULL, 1876 NULL, 1877 NULL, 1878 NULL, 1879 NULL, 1880 /* 99*/ NULL, 1881 NULL, 1882 NULL, 1883 NULL, 1884 NULL, 1885 /*104*/ MatMissingDiagonal_SeqSELL, 1886 NULL, 1887 NULL, 1888 NULL, 1889 NULL, 1890 /*109*/ NULL, 1891 NULL, 1892 NULL, 1893 NULL, 1894 NULL, 1895 /*114*/ NULL, 1896 NULL, 1897 NULL, 1898 NULL, 1899 NULL, 1900 /*119*/ NULL, 1901 NULL, 1902 NULL, 1903 NULL, 1904 NULL, 1905 /*124*/ NULL, 1906 NULL, 1907 NULL, 1908 NULL, 1909 NULL, 1910 /*129*/ MatFDColoringSetUp_SeqXAIJ, 1911 NULL, 1912 NULL, 1913 NULL, 1914 NULL, 1915 /*134*/ NULL, 1916 NULL, 1917 NULL, 1918 NULL, 1919 NULL, 1920 /*139*/ NULL, 1921 NULL, 1922 NULL, 1923 NULL, 1924 NULL, 1925 NULL}; 1926 1927 static PetscErrorCode MatStoreValues_SeqSELL(Mat mat) 1928 { 1929 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1930 1931 PetscFunctionBegin; 1932 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1933 1934 /* allocate space for values if not already there */ 1935 if (!a->saved_values) PetscCall(PetscMalloc1(a->sliidx[a->totalslices] + 1, &a->saved_values)); 1936 1937 /* copy values over */ 1938 PetscCall(PetscArraycpy(a->saved_values, a->val, a->sliidx[a->totalslices])); 1939 PetscFunctionReturn(PETSC_SUCCESS); 1940 } 1941 1942 static PetscErrorCode MatRetrieveValues_SeqSELL(Mat mat) 1943 { 1944 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1945 1946 PetscFunctionBegin; 1947 PetscCheck(a->nonew, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatSetOption(A,MAT_NEW_NONZERO_LOCATIONS,PETSC_FALSE);first"); 1948 PetscCheck(a->saved_values, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Must call MatStoreValues(A);first"); 1949 PetscCall(PetscArraycpy(a->val, a->saved_values, a->sliidx[a->totalslices])); 1950 PetscFunctionReturn(PETSC_SUCCESS); 1951 } 1952 1953 static PetscErrorCode MatSeqSELLGetFillRatio_SeqSELL(Mat mat, PetscReal *ratio) 1954 { 1955 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1956 1957 PetscFunctionBegin; 1958 if (a->totalslices && a->sliidx[a->totalslices]) { 1959 *ratio = (PetscReal)(a->sliidx[a->totalslices] - a->nz) / a->sliidx[a->totalslices]; 1960 } else { 1961 *ratio = 0.0; 1962 } 1963 PetscFunctionReturn(PETSC_SUCCESS); 1964 } 1965 1966 static PetscErrorCode MatSeqSELLGetMaxSliceWidth_SeqSELL(Mat mat, PetscInt *slicewidth) 1967 { 1968 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1969 PetscInt i, current_slicewidth; 1970 1971 PetscFunctionBegin; 1972 *slicewidth = 0; 1973 for (i = 0; i < a->totalslices; i++) { 1974 current_slicewidth = (a->sliidx[i + 1] - a->sliidx[i]) / a->sliceheight; 1975 if (current_slicewidth > *slicewidth) *slicewidth = current_slicewidth; 1976 } 1977 PetscFunctionReturn(PETSC_SUCCESS); 1978 } 1979 1980 static PetscErrorCode MatSeqSELLGetAvgSliceWidth_SeqSELL(Mat mat, PetscReal *slicewidth) 1981 { 1982 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1983 1984 PetscFunctionBegin; 1985 *slicewidth = 0; 1986 if (a->totalslices) *slicewidth = (PetscReal)a->sliidx[a->totalslices] / a->sliceheight / a->totalslices; 1987 PetscFunctionReturn(PETSC_SUCCESS); 1988 } 1989 1990 static PetscErrorCode MatSeqSELLGetVarSliceSize_SeqSELL(Mat mat, PetscReal *variance) 1991 { 1992 Mat_SeqSELL *a = (Mat_SeqSELL *)mat->data; 1993 PetscReal mean; 1994 PetscInt i, totalslices = a->totalslices, *sliidx = a->sliidx; 1995 1996 PetscFunctionBegin; 1997 *variance = 0; 1998 if (totalslices) { 1999 mean = (PetscReal)sliidx[totalslices] / totalslices; 2000 for (i = 1; i <= totalslices; i++) *variance += ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) * ((PetscReal)(sliidx[i] - sliidx[i - 1]) - mean) / totalslices; 2001 } 2002 PetscFunctionReturn(PETSC_SUCCESS); 2003 } 2004 2005 static PetscErrorCode MatSeqSELLSetSliceHeight_SeqSELL(Mat A, PetscInt sliceheight) 2006 { 2007 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2008 2009 PetscFunctionBegin; 2010 if (A->preallocated) PetscFunctionReturn(PETSC_SUCCESS); 2011 PetscCheck(a->sliceheight <= 0 || a->sliceheight == sliceheight, PETSC_COMM_SELF, PETSC_ERR_SUP, "Cannot change slice height %" PetscInt_FMT " to %" PetscInt_FMT, a->sliceheight, sliceheight); 2012 a->sliceheight = sliceheight; 2013 #if defined(PETSC_HAVE_CUPM) 2014 PetscCheck(PetscMax(DEVICE_MEM_ALIGN, sliceheight) % PetscMin(DEVICE_MEM_ALIGN, sliceheight) == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "The slice height is not compatible with DEVICE_MEM_ALIGN (one must be divisible by the other) %" PetscInt_FMT, sliceheight); 2015 #endif 2016 PetscFunctionReturn(PETSC_SUCCESS); 2017 } 2018 2019 /*@ 2020 MatSeqSELLGetFillRatio - returns a ratio that indicates the irregularity of the matrix. 2021 2022 Not Collective 2023 2024 Input Parameter: 2025 . A - a MATSEQSELL matrix 2026 2027 Output Parameter: 2028 . ratio - ratio of number of padded zeros to number of allocated elements 2029 2030 Level: intermediate 2031 2032 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2033 @*/ 2034 PetscErrorCode MatSeqSELLGetFillRatio(Mat A, PetscReal *ratio) 2035 { 2036 PetscFunctionBegin; 2037 PetscUseMethod(A, "MatSeqSELLGetFillRatio_C", (Mat, PetscReal *), (A, ratio)); 2038 PetscFunctionReturn(PETSC_SUCCESS); 2039 } 2040 2041 /*@ 2042 MatSeqSELLGetMaxSliceWidth - returns the maximum slice width. 2043 2044 Not Collective 2045 2046 Input Parameter: 2047 . A - a MATSEQSELL matrix 2048 2049 Output Parameter: 2050 . slicewidth - maximum slice width 2051 2052 Level: intermediate 2053 2054 .seealso: `MATSEQSELL`, `MatSeqSELLGetAvgSliceWidth()` 2055 @*/ 2056 PetscErrorCode MatSeqSELLGetMaxSliceWidth(Mat A, PetscInt *slicewidth) 2057 { 2058 PetscFunctionBegin; 2059 PetscUseMethod(A, "MatSeqSELLGetMaxSliceWidth_C", (Mat, PetscInt *), (A, slicewidth)); 2060 PetscFunctionReturn(PETSC_SUCCESS); 2061 } 2062 2063 /*@ 2064 MatSeqSELLGetAvgSliceWidth - returns the average slice width. 2065 2066 Not Collective 2067 2068 Input Parameter: 2069 . A - a MATSEQSELL matrix 2070 2071 Output Parameter: 2072 . slicewidth - average slice width 2073 2074 Level: intermediate 2075 2076 .seealso: `MATSEQSELL`, `MatSeqSELLGetMaxSliceWidth()` 2077 @*/ 2078 PetscErrorCode MatSeqSELLGetAvgSliceWidth(Mat A, PetscReal *slicewidth) 2079 { 2080 PetscFunctionBegin; 2081 PetscUseMethod(A, "MatSeqSELLGetAvgSliceWidth_C", (Mat, PetscReal *), (A, slicewidth)); 2082 PetscFunctionReturn(PETSC_SUCCESS); 2083 } 2084 2085 /*@ 2086 MatSeqSELLSetSliceHeight - sets the slice height. 2087 2088 Not Collective 2089 2090 Input Parameters: 2091 + A - a MATSEQSELL matrix 2092 - sliceheight - slice height 2093 2094 Notes: 2095 You cannot change the slice height once it have been set. 2096 2097 The slice height must be set before MatSetUp() or MatXXXSetPreallocation() is called. 2098 2099 Level: intermediate 2100 2101 .seealso: `MATSEQSELL`, `MatSeqSELLGetVarSliceSize()` 2102 @*/ 2103 PetscErrorCode MatSeqSELLSetSliceHeight(Mat A, PetscInt sliceheight) 2104 { 2105 PetscFunctionBegin; 2106 PetscUseMethod(A, "MatSeqSELLSetSliceHeight_C", (Mat, PetscInt), (A, sliceheight)); 2107 PetscFunctionReturn(PETSC_SUCCESS); 2108 } 2109 2110 /*@ 2111 MatSeqSELLGetVarSliceSize - returns the variance of the slice size. 2112 2113 Not Collective 2114 2115 Input Parameter: 2116 . A - a MATSEQSELL matrix 2117 2118 Output Parameter: 2119 . variance - variance of the slice size 2120 2121 Level: intermediate 2122 2123 .seealso: `MATSEQSELL`, `MatSeqSELLSetSliceHeight()` 2124 @*/ 2125 PetscErrorCode MatSeqSELLGetVarSliceSize(Mat A, PetscReal *variance) 2126 { 2127 PetscFunctionBegin; 2128 PetscUseMethod(A, "MatSeqSELLGetVarSliceSize_C", (Mat, PetscReal *), (A, variance)); 2129 PetscFunctionReturn(PETSC_SUCCESS); 2130 } 2131 2132 #if defined(PETSC_HAVE_CUDA) 2133 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLCUDA(Mat); 2134 #endif 2135 #if defined(PETSC_HAVE_HIP) 2136 PETSC_EXTERN PetscErrorCode MatConvert_SeqSELL_SeqSELLHIP(Mat); 2137 #endif 2138 2139 PETSC_EXTERN PetscErrorCode MatCreate_SeqSELL(Mat B) 2140 { 2141 Mat_SeqSELL *b; 2142 PetscMPIInt size; 2143 2144 PetscFunctionBegin; 2145 PetscCall(PetscCitationsRegister(citation, &cited)); 2146 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size)); 2147 PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Comm must be of size 1"); 2148 2149 PetscCall(PetscNew(&b)); 2150 2151 B->data = (void *)b; 2152 B->ops[0] = MatOps_Values; 2153 2154 b->row = NULL; 2155 b->col = NULL; 2156 b->icol = NULL; 2157 b->reallocs = 0; 2158 b->ignorezeroentries = PETSC_FALSE; 2159 b->roworiented = PETSC_TRUE; 2160 b->nonew = 0; 2161 b->diag = NULL; 2162 b->solve_work = NULL; 2163 B->spptr = NULL; 2164 b->saved_values = NULL; 2165 b->idiag = NULL; 2166 b->mdiag = NULL; 2167 b->ssor_work = NULL; 2168 b->omega = 1.0; 2169 b->fshift = 0.0; 2170 b->keepnonzeropattern = PETSC_FALSE; 2171 b->sliceheight = 0; 2172 2173 PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQSELL)); 2174 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetArray_C", MatSeqSELLGetArray_SeqSELL)); 2175 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLRestoreArray_C", MatSeqSELLRestoreArray_SeqSELL)); 2176 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatStoreValues_C", MatStoreValues_SeqSELL)); 2177 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatRetrieveValues_C", MatRetrieveValues_SeqSELL)); 2178 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetPreallocation_C", MatSeqSELLSetPreallocation_SeqSELL)); 2179 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqaij_C", MatConvert_SeqSELL_SeqAIJ)); 2180 #if defined(PETSC_HAVE_CUDA) 2181 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellcuda_C", MatConvert_SeqSELL_SeqSELLCUDA)); 2182 #endif 2183 #if defined(PETSC_HAVE_HIP) 2184 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqsell_seqsellhip_C", MatConvert_SeqSELL_SeqSELLHIP)); 2185 #endif 2186 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetFillRatio_C", MatSeqSELLGetFillRatio_SeqSELL)); 2187 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetMaxSliceWidth_C", MatSeqSELLGetMaxSliceWidth_SeqSELL)); 2188 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetAvgSliceWidth_C", MatSeqSELLGetAvgSliceWidth_SeqSELL)); 2189 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLGetVarSliceSize_C", MatSeqSELLGetVarSliceSize_SeqSELL)); 2190 PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatSeqSELLSetSliceHeight_C", MatSeqSELLSetSliceHeight_SeqSELL)); 2191 2192 PetscObjectOptionsBegin((PetscObject)B); 2193 { 2194 PetscInt newsh = -1; 2195 PetscBool flg; 2196 #if defined(PETSC_HAVE_CUPM) 2197 PetscInt chunksize = 0; 2198 #endif 2199 2200 PetscCall(PetscOptionsInt("-mat_sell_slice_height", "Set the slice height used to store SELL matrix", "MatSELLSetSliceHeight", newsh, &newsh, &flg)); 2201 if (flg) PetscCall(MatSeqSELLSetSliceHeight(B, newsh)); 2202 #if defined(PETSC_HAVE_CUPM) 2203 PetscCall(PetscOptionsInt("-mat_sell_chunk_size", "Set the chunksize for load-balanced CUDA/HIP kernels. Choices include 64,128,256,512,1024", NULL, chunksize, &chunksize, &flg)); 2204 if (flg) { 2205 PetscCheck(chunksize >= 64 && chunksize <= 1024 && chunksize % 64 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "chunksize must be a number in {64,128,256,512,1024}: value %" PetscInt_FMT, chunksize); 2206 b->chunksize = chunksize; 2207 } 2208 #endif 2209 } 2210 PetscOptionsEnd(); 2211 PetscFunctionReturn(PETSC_SUCCESS); 2212 } 2213 2214 /* 2215 Given a matrix generated with MatGetFactor() duplicates all the information in A into B 2216 */ 2217 static PetscErrorCode MatDuplicateNoCreate_SeqSELL(Mat C, Mat A, MatDuplicateOption cpvalues, PetscBool mallocmatspace) 2218 { 2219 Mat_SeqSELL *c = (Mat_SeqSELL *)C->data, *a = (Mat_SeqSELL *)A->data; 2220 PetscInt i, m = A->rmap->n; 2221 PetscInt totalslices = a->totalslices; 2222 2223 PetscFunctionBegin; 2224 C->factortype = A->factortype; 2225 c->row = NULL; 2226 c->col = NULL; 2227 c->icol = NULL; 2228 c->reallocs = 0; 2229 C->assembled = PETSC_TRUE; 2230 2231 PetscCall(PetscLayoutReference(A->rmap, &C->rmap)); 2232 PetscCall(PetscLayoutReference(A->cmap, &C->cmap)); 2233 2234 c->sliceheight = a->sliceheight; 2235 PetscCall(PetscMalloc1(c->sliceheight * totalslices, &c->rlen)); 2236 PetscCall(PetscMalloc1(totalslices + 1, &c->sliidx)); 2237 2238 for (i = 0; i < m; i++) c->rlen[i] = a->rlen[i]; 2239 for (i = 0; i < totalslices + 1; i++) c->sliidx[i] = a->sliidx[i]; 2240 2241 /* allocate the matrix space */ 2242 if (mallocmatspace) { 2243 PetscCall(PetscMalloc2(a->maxallocmat, &c->val, a->maxallocmat, &c->colidx)); 2244 2245 c->singlemalloc = PETSC_TRUE; 2246 2247 if (m > 0) { 2248 PetscCall(PetscArraycpy(c->colidx, a->colidx, a->maxallocmat)); 2249 if (cpvalues == MAT_COPY_VALUES) { 2250 PetscCall(PetscArraycpy(c->val, a->val, a->maxallocmat)); 2251 } else { 2252 PetscCall(PetscArrayzero(c->val, a->maxallocmat)); 2253 } 2254 } 2255 } 2256 2257 c->ignorezeroentries = a->ignorezeroentries; 2258 c->roworiented = a->roworiented; 2259 c->nonew = a->nonew; 2260 if (a->diag) { 2261 PetscCall(PetscMalloc1(m, &c->diag)); 2262 for (i = 0; i < m; i++) c->diag[i] = a->diag[i]; 2263 } else c->diag = NULL; 2264 2265 c->solve_work = NULL; 2266 c->saved_values = NULL; 2267 c->idiag = NULL; 2268 c->ssor_work = NULL; 2269 c->keepnonzeropattern = a->keepnonzeropattern; 2270 c->free_val = PETSC_TRUE; 2271 c->free_colidx = PETSC_TRUE; 2272 2273 c->maxallocmat = a->maxallocmat; 2274 c->maxallocrow = a->maxallocrow; 2275 c->rlenmax = a->rlenmax; 2276 c->nz = a->nz; 2277 C->preallocated = PETSC_TRUE; 2278 2279 c->nonzerorowcnt = a->nonzerorowcnt; 2280 C->nonzerostate = A->nonzerostate; 2281 2282 PetscCall(PetscFunctionListDuplicate(((PetscObject)A)->qlist, &((PetscObject)C)->qlist)); 2283 PetscFunctionReturn(PETSC_SUCCESS); 2284 } 2285 2286 PetscErrorCode MatDuplicate_SeqSELL(Mat A, MatDuplicateOption cpvalues, Mat *B) 2287 { 2288 PetscFunctionBegin; 2289 PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2290 PetscCall(MatSetSizes(*B, A->rmap->n, A->cmap->n, A->rmap->n, A->cmap->n)); 2291 if (!(A->rmap->n % A->rmap->bs) && !(A->cmap->n % A->cmap->bs)) PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2292 PetscCall(MatSetType(*B, ((PetscObject)A)->type_name)); 2293 PetscCall(MatDuplicateNoCreate_SeqSELL(*B, A, cpvalues, PETSC_TRUE)); 2294 PetscFunctionReturn(PETSC_SUCCESS); 2295 } 2296 2297 /*MC 2298 MATSEQSELL - MATSEQSELL = "seqsell" - A matrix type to be used for sequential sparse matrices, 2299 based on the sliced Ellpack format, {cite}`zhangellpack2018` 2300 2301 Options Database Key: 2302 . -mat_type seqsell - sets the matrix type to "`MATSEQELL` during a call to `MatSetFromOptions()` 2303 2304 Level: beginner 2305 2306 .seealso: `Mat`, `MatCreateSeqSELL()`, `MATSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATAIJ`, `MATMPIAIJ` 2307 M*/ 2308 2309 /*MC 2310 MATSELL - MATSELL = "sell" - A matrix type to be used for sparse matrices, {cite}`zhangellpack2018` 2311 2312 This matrix type is identical to `MATSEQSELL` when constructed with a single process communicator, 2313 and `MATMPISELL` otherwise. As a result, for single process communicators, 2314 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2315 for communicators controlling multiple processes. It is recommended that you call both of 2316 the above preallocation routines for simplicity. 2317 2318 Options Database Key: 2319 . -mat_type sell - sets the matrix type to "sell" during a call to MatSetFromOptions() 2320 2321 Level: beginner 2322 2323 Notes: 2324 This format is only supported for real scalars, double precision, and 32-bit indices (the defaults). 2325 2326 It can provide better performance on Intel and AMD processes with AVX2 or AVX512 support for matrices that have a similar number of 2327 non-zeros in contiguous groups of rows. However if the computation is memory bandwidth limited it may not provide much improvement. 2328 2329 Developer Notes: 2330 On Intel (and AMD) systems some of the matrix operations use SIMD (AVX) instructions to achieve higher performance. 2331 2332 The sparse matrix format is as follows. For simplicity we assume a slice size of 2, it is actually 8 2333 .vb 2334 (2 0 3 4) 2335 Consider the matrix A = (5 0 6 0) 2336 (0 0 7 8) 2337 (0 0 9 9) 2338 2339 symbolically the Ellpack format can be written as 2340 2341 (2 3 4 |) (0 2 3 |) 2342 v = (5 6 0 |) colidx = (0 2 2 |) 2343 -------- --------- 2344 (7 8 |) (2 3 |) 2345 (9 9 |) (2 3 |) 2346 2347 The data for 2 contiguous rows of the matrix are stored together (in column-major format) (with any left-over rows handled as a special case). 2348 Any of the rows in a slice fewer columns than the rest of the slice (row 1 above) are padded with a previous valid column in their "extra" colidx[] locations and 2349 zeros in their "extra" v locations so that the matrix operations do not need special code to handle different length rows within the 2 rows in a slice. 2350 2351 The one-dimensional representation of v used in the code is (2 5 3 6 4 0 7 9 8 9) and for colidx is (0 0 2 2 3 2 2 2 3 3) 2352 2353 .ve 2354 2355 See `MatMult_SeqSELL()` for how this format is used with the SIMD operations to achieve high performance. 2356 2357 .seealso: `Mat`, `MatCreateSeqSELL()`, `MatCreateSeqAIJ()`, `MatCreateSELL()`, `MATSEQSELL`, `MATMPISELL`, `MATSEQAIJ`, `MATMPIAIJ`, `MATAIJ` 2358 M*/ 2359 2360 /*@ 2361 MatCreateSeqSELL - Creates a sparse matrix in `MATSEQSELL` format. 2362 2363 Collective 2364 2365 Input Parameters: 2366 + comm - MPI communicator, set to `PETSC_COMM_SELF` 2367 . m - number of rows 2368 . n - number of columns 2369 . rlenmax - maximum number of nonzeros in a row, ignored if `rlen` is provided 2370 - rlen - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL 2371 2372 Output Parameter: 2373 . A - the matrix 2374 2375 Level: intermediate 2376 2377 Notes: 2378 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 2379 MatXXXXSetPreallocation() paradigm instead of this routine directly. 2380 [MatXXXXSetPreallocation() is, for example, `MatSeqSELLSetPreallocation()`] 2381 2382 Specify the preallocated storage with either `rlenmax` or `rlen` (not both). 2383 Set `rlenmax` = `PETSC_DEFAULT` and `rlen` = `NULL` for PETSc to control dynamic memory 2384 allocation. 2385 2386 .seealso: `Mat`, `MATSEQSELL`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MatSeqSELLSetPreallocation()`, `MATSELL`, `MATMPISELL` 2387 @*/ 2388 PetscErrorCode MatCreateSeqSELL(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt rlenmax, const PetscInt rlen[], Mat *A) 2389 { 2390 PetscFunctionBegin; 2391 PetscCall(MatCreate(comm, A)); 2392 PetscCall(MatSetSizes(*A, m, n, m, n)); 2393 PetscCall(MatSetType(*A, MATSEQSELL)); 2394 PetscCall(MatSeqSELLSetPreallocation_SeqSELL(*A, rlenmax, rlen)); 2395 PetscFunctionReturn(PETSC_SUCCESS); 2396 } 2397 2398 PetscErrorCode MatEqual_SeqSELL(Mat A, Mat B, PetscBool *flg) 2399 { 2400 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data, *b = (Mat_SeqSELL *)B->data; 2401 PetscInt totalslices = a->totalslices; 2402 2403 PetscFunctionBegin; 2404 /* If the matrix dimensions are not equal,or no of nonzeros */ 2405 if ((A->rmap->n != B->rmap->n) || (A->cmap->n != B->cmap->n) || (a->nz != b->nz) || (a->rlenmax != b->rlenmax)) { 2406 *flg = PETSC_FALSE; 2407 PetscFunctionReturn(PETSC_SUCCESS); 2408 } 2409 /* if the a->colidx are the same */ 2410 PetscCall(PetscArraycmp(a->colidx, b->colidx, a->sliidx[totalslices], flg)); 2411 if (!*flg) PetscFunctionReturn(PETSC_SUCCESS); 2412 /* if a->val are the same */ 2413 PetscCall(PetscArraycmp(a->val, b->val, a->sliidx[totalslices], flg)); 2414 PetscFunctionReturn(PETSC_SUCCESS); 2415 } 2416 2417 PetscErrorCode MatConjugate_SeqSELL(Mat A) 2418 { 2419 #if defined(PETSC_USE_COMPLEX) 2420 Mat_SeqSELL *a = (Mat_SeqSELL *)A->data; 2421 PetscInt i; 2422 PetscScalar *val = a->val; 2423 2424 PetscFunctionBegin; 2425 for (i = 0; i < a->sliidx[a->totalslices]; i++) val[i] = PetscConj(val[i]); 2426 #if defined(PETSC_HAVE_CUPM) 2427 if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) A->offloadmask = PETSC_OFFLOAD_CPU; 2428 #endif 2429 #else 2430 PetscFunctionBegin; 2431 #endif 2432 PetscFunctionReturn(PETSC_SUCCESS); 2433 } 2434